Package Loading and Data Importing

Load Packages and Custom Functions

# Dependency bootstrap: make sure each package is installed, then attach it.
#
# A missing package is installed and then attached in the same session, so a
# first run behaves like every later run, and a failed installation stops the
# script here (library() errors) instead of surfacing further down.
#
# @param pkg     Package name (character scalar).
# @param install Zero-argument function that installs `pkg`.
# @return Invisibly, the value of `library()`.
ensure_attached <- function(pkg, install) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install()
  }
  invisible(library(pkg, character.only = TRUE, quietly = TRUE))
}

# Installers first: the entries further down call into BiocManager / remotes.
ensure_attached("BiocManager", function() install.packages("BiocManager"))
ensure_attached("remotes", function() install.packages("remotes"))
ensure_attached("pacman", function() install.packages("pacman"))

ensure_attached(
  "gtsummary",
  function() remotes::install_github("ddsjoberg/gtsummary")
)
ensure_attached(
  "mixOmics",
  function() BiocManager::install("mixOmicsTeam/mixOmics")
)
ensure_attached(
  "yingtools2",
  function() remotes::install_github("ying14/yingtools2")
)
ensure_attached(
  "ggpirate",
  function() remotes::install_github("mikabr/ggpirate")
)
ensure_attached(
  "paletteer",
  function() remotes::install_github("EmilHvitfeldt/paletteer")
)
ensure_attached(
  "microbiomeMarker",
  function() BiocManager::install("microbiomeMarker")
)
ensure_attached("magick", function() install.packages("magick"))
ensure_attached("mltest", function() install.packages("mltest"))
ensure_attached(
  "biostatUtil",
  function() remotes::install_github("talhouklab/biostatUtil")
)

# Attach every analysis package in one call. `install = FALSE` means pacman
# only loads; it does not try to install anything that is missing.
library(pacman)
p_load(
  factoextra,               # Dimension reduction
  FactoMineR,               # Dimension reduction
  stabs,                    # Stability variable selection
  mboost,                   # Gradient boosting for model building
  ggrepel,                  # Visualization, repels labels on plots
  bestglm,                  # Logistic regression
  knitr,                    # To change R markdown PDF options
  caret,                    # To calculate model parameters
  cutpointr,                # Calculate cutpoints
  RPostgreSQL,              # Connect to PostgreSQL server
  phyloseq,                 # Phyloseq data wrangling
  # microbiomeMarker,         # LEfSe analysis
  ComplexHeatmap,           # Heatmap generator
  paletteer,                # Extensive color palette
  tidyverse,                # Data wrangling and visualization
  circlize,                 # Color ramp builder
  rstatix,                  # Tidyverse statistics
  EnhancedVolcano,          # Volcano plot analysis
  umap,                     # Uniform Manifold Approximation Projection
  ggpubr,                   # Simple ggplots with stats
  ggpirate,                 # ggplot version of Pirate plot
  mixOmics,                 # PLS-DA
  conflicted,               # Forces conflicted packages to be used by preferred package
  gridExtra,                # Arrange multiple grobs
  reticulate,               # Run python in R
  tableone,                 # Additional clinical tables support
  lubridate,                # Manipulate date objects
  yingtools2,               # Custom plotting functions
  cowplot,                  # Combine plots together
  epiR,                     # Obtain model performance measures
  pROC,                     # Produce ROC curves
  plotrix,                  # Add table to base R plot
  glmnet,                   # LASSO regression
  forcats,                  # Factor reordering
  ggpmisc,                  # Miscellaneous ggplot helper functions
  doParallel,               # Parallelize functions (for optimizing cutpoints)
  PRISMAstatement,          # Flow table generator
  DiagrammeRsvg,            # Flow table visualizer
  rsvg,                     # Flow table visualizer
  scales,                   # Scale functions for visualizations
  devtools,                 # To load packages from GitHub
  # datscience,               # Bibliography manager
  formatR,                  # Markdown code formatter
  openxlsx,                 # Excel worksheet manipulator
  biostatUtil,              # Mainly used for multi-level confusion matrix confidence intervals
  irr,                      # Dependency package
  rJava,                    # Dependency package
  install = FALSE
)

library("gtsummary")  # Clinical tables, pacman had trouble loading this in, so had to go the manual route

# Tell `conflicted` which namespace wins for each masked function name:
# dplyr for the data verbs, ggplot2 for annotate().
invisible(lapply(
  c("select", "mutate", "filter", "rename", "slice", "between"),
  conflict_prefer,
  winner = "dplyr"
))
conflict_prefer("annotate", "ggplot2")

# Source the getRdpPal.R helper script straight from GitHub.
# NOTE(review): this executes remote code from the `master` branch, so its
# content can change between runs — consider pinning it (source_url() accepts
# a `sha1` argument) or vendoring the script.
devtools::source_url("https://github.com/yingeddi2008/DFIutility/blob/master/getRdpPal.R?raw=TRUE")

# ggplot theme shortcuts
et <- element_text
eb <- element_blank
er <- element_rect
el <- element_line

# knitr chunk defaults: tidy the echoed code and wrap it at 60 characters
opts_chunk$set(tidy.opts = list(width.cutoff = 60), tidy = TRUE)

# "not in" operator: the logical negation of %in%
`%!in%` <- negate(`%in%`)

# Running python in R
# `required = TRUE` makes reticulate error if the "base" conda env is missing
use_condaenv("base", required = TRUE)

Import Data

Load in all Metagenomic, Metabolomic and Clinical Data

# 1) Load R image
# Restores every object saved in the image into the current environment.
load("./Data/LT_Modeling.RData")

# # OR # #
# 2) Individually Load R Objects
#
# # Sample lookup
# sample_lookup <- readRDS('./Data/sample_lookup.rds')
#
# # First samples from all patients
# first_samps <- readRDS('./Data/first_samps_anon.rds')
#
# # Simplified dataframe containing infection information and relative
# # abundance of targeted taxa
# peri_matrix_all <- readRDS('./Data/peri_matrix_clin_all.rds')
#
# # Distinct detailed infection data
# peri_criteria_best <- readRDS('./Data/peri_criteria_best_anon.rds')
#
# # All detailed infection data
# peri_criteria_all <- readRDS('./Data/peri_criteria_all_anon.rds')
#
# # NCBI taxonomy lookup
# tax_lookup <- readRDS('./Data/tax_lookup.rds')
#
# # Complete metaphlan dataframe for all patients and healthy donors
# metaphlan_df <- readRDS('./Data/metaphlan_anon.rds')
#
# # Metaphlan dataframe of patient samples
# metaphlan_peri_anon <- readRDS('./Data/metaphlan_peri_anon.rds')
#
# # Custom color palette
# metaphlan_pal <- getRdpPal(metaphlan_df)
#
# # Qualitative metabolomics
# metab_qual_anon <- readRDS('./Data/metab_qual_anonym.rds')
#
# # Quantitative metabolomics
# metab_quant_anon <- readRDS('./Data/metab_quant_anonym.rds')
#
# # Antibiotics data
# abx <- readRDS('./Data/abx_anon.rds')
# abx <- readRDS('./Data/original_cohort_abx.rds')
#
# # Demographics data
# demo <- readRDS('./Data/demo_anon.rds')
#
# # Bile acid gene data
# ba_genes <- readRDS('./Data/bile_acid_genes.rds')
#
# # CARD genes
# card_dict <- readRDS('./Data/card_dict.rds')
# card_dict <- card_dict %>%
#   filter(dbsource == 'card') %>%
#   mutate(
#     AMRGeneFamily = ifelse(
#       AMRGeneFamily == 'glycopeptide resistance gene cluster;van ligase',
#       'glycopeptide resistance gene cluster;vanA',
#       AMRGeneFamily
#     ),
#     AMRGeneFamily = gsub(
#       pattern = ';', replacement = '\n', x = AMRGeneFamily
#     )
#   )
#
# # CARD data
# card2 <- readRDS('./Data/card.rds')

Figures

Optimal Cutpoint Analysis: Relative Abundance of Enterococcus and Enterobacterales

# Custom function to avoid any errors during map: a failing cutpointr() call
# yields the string "Error" instead of aborting the whole group_map().
safe_cutpointr <- possibly(.f = cutpointr, otherwise = "Error")

# Calculate the number of cores: leave two free, but never ask for fewer than
# one worker. detectCores() may return NA or a value <= 2, and a worker count
# that is NA or below one cannot be used to build the cluster.
no_cores <- max(1L, detectCores() - 2L, na.rm = TRUE)

# create the cluster used for the parallel bootstrap
# NOTE(review): no stopCluster(cl) is visible in this part of the script —
# confirm the workers are released once the parallel work is finished.
cl <- makePSOCKcluster(no_cores)
registerDoParallel(cl)

# NOTE(review): set.seed() seeds this session only; confirm whether the
# bootstrap draws made on the PSOCK workers are reproducible.
set.seed(123456)

# One cutpointr fit per (infection type, input variable) pair.
# Result: a list of cutpointr objects (or "Error" strings), ordered by the
# sorted grouping keys; later code picks elements out of it by position.
test_abundance <- peri_matrix_all %>%
  mutate(sampleID = as.factor(sampleID)) %>%
  select(
    starts_with(c("entero", "esch", "klebs", "citro", "rahn", "proteus")),
    -ends_with("abs_abundance")
  ) %>%
  mutate(across(where(is.character), as.numeric)) %>%
  # Collapse the per-species columns into one 0/1 flag per organism group
  mutate(
    enterococcus_infection = ifelse(
      `Enterococcus faecium` + `Enterococcus faecalis` +
        `Enterococcus avium` >= 1,
      1, 0
    ),
    enterobacterales_infection = ifelse(
      `Escherichia coli` + `Klebsiella pneumoniae` +
        `Citrobacter freundii` + `Proteus mirabilis` >= 1,
      1, 0
    )
  ) %>%
  select(-c(
    `Enterococcus faecium`, `Enterococcus faecalis`, `Enterococcus avium`,
    `Escherichia coli`, `Klebsiella pneumoniae`, `Citrobacter freundii`,
    `Proteus mirabilis`
  )) %>%
  # Long format twice: every remaining input column x every infection flag
  pivot_longer(
    -c(enterococcus_infection, enterobacterales_infection),
    names_to = "variable", values_to = "value"
  ) %>%
  pivot_longer(
    -c(variable, value),
    names_to = "infection_type", values_to = "infection"
  ) %>%
  # This label becomes the cutpointr `subgroup`; later steps split it on "\n"
  mutate(
    variable = paste0("Input: ", variable, "\nPredict: ", infection_type)
  ) %>%
  group_by(infection_type, variable) %>%
  group_map(
    ~safe_cutpointr(
      ., value, infection, variable,
      method = maximize_metric, metric = youden, pos_class = 1,
      boot_runs = 100, allowParallel = TRUE, na.rm = TRUE
    ),
    .keep = TRUE
  )

# Print the two cutpointr fits that later steps pick out by position
test_abundance[[4]]  # ecoc rel abd, ecoc infection
test_abundance[[1]]  # ebac rel abd, ebacc infection

# Stack every per-group result into a single tibble
test_abundance_unnest <- map_df(test_abundance, as_tibble)

# Threshold cutpoints for relative abundance, restricted to the matched
# pairs (Enterobacterales input -> Enterobacterales infection, Enterococcus
# input -> Enterococcus infection).
# The result stays grouped by pos_class, and later code reads
# `$optimal_cutpoint` by row position.
optimal_cutpoint_rel <- test_abundance_unnest %>%
  separate(
    col = subgroup,
    into = c("Input", "Predict"),
    sep = "\n",
    remove = FALSE
  ) %>%
  filter(
    (grepl("enterobacterales", Input) & grepl("enterobacterales", Predict)) |
      (grepl("enterococcus", Input) & grepl("enterococcus", Predict))
  ) %>%
  group_by(pos_class) %>%
  filter(grepl("rel", subgroup)) %>%
  select(subgroup, optimal_cutpoint)

Figure 5B: Enterococcus Infection and ROC

# Expansions and Infections Stats
# Counts samples in each infection x expansion cell for both organism groups
# and returns one row per metric with columns `metric`, `count`, `percent`.
# "Expansion" means relative abundance at or above the optimal cutpoint.
expan_infx_stats <- local({
  # Row 2 of optimal_cutpoint_rel is read as the Enterococcus cutpoint and
  # row 1 as the Enterobacterales cutpoint.
  ecoc_cut <- optimal_cutpoint_rel$optimal_cutpoint[2]
  ebac_cut <- optimal_cutpoint_rel$optimal_cutpoint[1]

  peri_matrix_all %>%
    left_join(
      peri_matrix_all %>%
        select(sampleID, ends_with("infection")) %>%
        ungroup()
    ) %>%
    mutate(
      ecoc_infx = ifelse(enterococcus_infection >= 1, 1, 0),
      ebac_infx = ifelse(enterobacterales_infection >= 1, 1, 0)
    ) %>%
    select(
      enterococcus_rel_abundance, enterobacterales_rel_abundance,
      ecoc_infx, ebac_infx
    ) %>%
    # Every sum drops NAs, including the two "expan_above" totals, so a
    # single missing abundance cannot turn a total into NA.
    summarise(
      ecoc_expan_above_cutpoint = sum(
        enterococcus_rel_abundance >= ecoc_cut,
        na.rm = TRUE
      ),
      ecoc_infx_ecoc_below_cutpoint = sum(
        ecoc_infx == 1 & enterococcus_rel_abundance < ecoc_cut,
        na.rm = TRUE
      ),
      ecoc_infx_ecoc_above_cutpoint = sum(
        ecoc_infx == 1 & enterococcus_rel_abundance >= ecoc_cut,
        na.rm = TRUE
      ),
      ecoc_noinfx_ecoc_below_cutpoint = sum(
        ecoc_infx == 0 & enterococcus_rel_abundance < ecoc_cut,
        na.rm = TRUE
      ),
      ecoc_noinfx_ecoc_above_cutpoint = sum(
        ecoc_infx == 0 & enterococcus_rel_abundance >= ecoc_cut,
        na.rm = TRUE
      ),
      ebac_expan_above_cutpoint = sum(
        enterobacterales_rel_abundance >= ebac_cut,
        na.rm = TRUE
      ),
      ebac_infx_ebac_below_cutpoint = sum(
        ebac_infx == 1 & enterobacterales_rel_abundance < ebac_cut,
        na.rm = TRUE
      ),
      ebac_infx_ebac_above_cutpoint = sum(
        ebac_infx == 1 & enterobacterales_rel_abundance >= ebac_cut,
        na.rm = TRUE
      ),
      ebac_noinfx_ebac_below_cutpoint = sum(
        ebac_infx == 0 & enterobacterales_rel_abundance < ebac_cut,
        na.rm = TRUE
      ),
      ebac_noinfx_ebac_above_cutpoint = sum(
        ebac_infx == 0 & enterobacterales_rel_abundance >= ebac_cut,
        na.rm = TRUE
      )
    ) %>%
    pivot_longer(everything(), names_to = "metric", values_to = "count") %>%
    # NOTE(review): 107 is hard-coded; presumably the cohort size — confirm
    # it matches the number of samples being counted.
    mutate(percent = round((count / 107) * 100, 2))
})

# Enterococcus: the four infection x expansion counts, one column per cell
ecoc_infx_confusion <- expan_infx_stats %>%
  filter(
    metric %in% c(
      "ecoc_infx_ecoc_below_cutpoint",
      "ecoc_infx_ecoc_above_cutpoint",
      "ecoc_noinfx_ecoc_below_cutpoint",
      "ecoc_noinfx_ecoc_above_cutpoint"
    )
  ) %>%
  select(metric, count) %>%
  pivot_wider(names_from = metric, values_from = count)

# 2 x 2 table: row 1 = infection, row 2 = no infection
ecoc_infx_confusion_cnfs <- data.frame(
  Enterococcus = c("Infection", "No Infection"),
  Expansion = c(
    ecoc_infx_confusion$ecoc_infx_ecoc_above_cutpoint,
    ecoc_infx_confusion$ecoc_noinfx_ecoc_above_cutpoint
  ),
  No.Expansion = c(
    ecoc_infx_confusion$ecoc_infx_ecoc_below_cutpoint,
    ecoc_infx_confusion$ecoc_noinfx_ecoc_below_cutpoint
  )
)

# Summary statistics from the table cells; each is a single value that is
# repeated on both rows.
ecoc_infx_confusion_cnfs <- ecoc_infx_confusion_cnfs %>%
  mutate(
    sensitivity = round(
      Expansion[1] / (Expansion[1] + No.Expansion[1]),
      3
    ),
    specificity = round(
      No.Expansion[2] / (No.Expansion[2] + Expansion[2]),
      3
    ),
    odds_ratio = round(
      (Expansion[1] / No.Expansion[1]) / (Expansion[2] / No.Expansion[2]),
      3
    )
  )

# Figure 5B: ROC curve for Enterococcus relative abundance predicting
# Enterococcus infection, annotated with AUC / accuracy / sensitivity /
# specificity (each with a bootstrap interval), the optimal cutpoint, the
# odds ratio and the 2 x 2 confusion table.
#
# Built inside local() so the helpers and temporaries below stay out of the
# global environment; only `fig_5b` is assigned.
fig_5b <- local({
  # Format a boot_ci() result as " [first, second]". Column 3 of the result
  # holds the interval values: row 1 supplies the first bound, row 2 the
  # second. `percent = TRUE` prints whole percentages, otherwise two decimals.
  format_boot_ci <- function(ci, percent) {
    bounds <- round(ci[[3]][1:2], 2)
    bounds <- if (percent) {
      paste0(formatC(bounds * 100, digits = 0, format = "f"), "%")
    } else {
      formatC(bounds, digits = 2, format = "f")
    }
    paste0(" [", bounds[1], ", ", bounds[2], "]")
  }

  # Interval suffixes for one cutpointr fit; boot_ci() runs once per metric.
  ci_suffixes <- function(cp) {
    list(
      auc = format_boot_ci(
        boot_ci(x = cp, AUC, alpha = 0.05), percent = FALSE
      ),
      acc = format_boot_ci(
        boot_ci(x = cp, acc, alpha = 0.05), percent = TRUE
      ),
      sens = format_boot_ci(
        boot_ci(x = cp, sensitivity, alpha = 0.05), percent = TRUE
      ),
      spec = format_boot_ci(
        boot_ci(x = cp, specificity, alpha = 0.05), percent = TRUE
      )
    )
  }

  ecoc_ci <- ci_suffixes(test_abundance[[4]])  # ecoc rel abd, ecoc infection
  ebac_ci <- ci_suffixes(test_abundance[[1]])  # ebac rel abd, ebac infection

  # Confusion table drawn inside the panel
  confusion_tbl <- ecoc_infx_confusion_cnfs %>%
    column_to_rownames(var = "Enterococcus") %>%
    select(Expansion, `No Expansion` = No.Expansion) %>%
    mutate(Total = rowSums(.))

  # One row per ROC point, carrying the (constant) annotation labels
  roc_data <- test_abundance_unnest %>%
    separate(
      subgroup,
      into = c("Input", "Predict"), sep = "\n", remove = FALSE
    ) %>%
    filter(
      grepl("enterobacterales", Input) & grepl("enterobacterales", Predict) |
        grepl("enterococcus", Input) & grepl("enterococcus", Predict)
    ) %>%
    unnest(roc_curve) %>%
    arrange(desc(AUC)) %>%
    select(-boot) %>%
    mutate(
      auc_label = paste0(
        "AUC = ", formatC(round(AUC, 3), digits = 2, format = "f")
      ),
      auc_label = case_when(
        grepl("enterococcus_rel_abundance", Input) ~
          paste0(auc_label, ecoc_ci$auc),
        grepl("enterobacterales_rel_abundance", Input) ~
          paste0(auc_label, ebac_ci$auc)
      ),
      acc_label = paste0(
        "Accuracy = ",
        formatC(round(acc, 3) * 100, digits = 0, format = "f"), "%"
      ),
      acc_label = case_when(
        grepl("enterococcus_rel_abundance", Input) ~
          paste0(acc_label, ecoc_ci$acc),
        grepl("enterobacterales_rel_abundance", Input) ~
          paste0(acc_label, ebac_ci$acc)
      ),
      sens_label = paste0(
        "Sensitivity = ",
        formatC(round(sensitivity, 3) * 100, digits = 0, format = "f"), "%"
      ),
      sens_label = case_when(
        grepl("enterococcus_rel_abundance", Input) ~
          paste0(sens_label, ecoc_ci$sens),
        grepl("enterobacterales_rel_abundance", Input) ~
          paste0(sens_label, ebac_ci$sens)
      ),
      spec_label = paste0(
        "Specificity = ",
        formatC(round(specificity, 3) * 100, digits = 0, format = "f"), "%"
      ),
      spec_label = case_when(
        grepl("enterococcus_rel_abundance", Input) ~
          paste0(spec_label, ecoc_ci$spec),
        grepl("enterobacterales_rel_abundance", Input) ~
          paste0(spec_label, ebac_ci$spec)
      )
    ) %>%
    filter(grepl(pattern = "rel_abundance", x = Input)) %>%
    mutate(
      subgroup = str_to_title(subgroup),
      subgroup = gsub(pattern = "_", replacement = " ", x = subgroup),
      subgroup = gsub(
        pattern = "rel abundance",
        replacement = "Relative Abundance (%)", x = subgroup
      ),
      subgroup2 = ifelse(
        grepl(x = subgroup, pattern = "enterococcus", ignore.case = TRUE),
        "Enterococcus", "Enterobacterales"
      ),
      subgroup2 = factor(
        subgroup2,
        levels = c("Enterococcus", "Enterobacterales")
      ),
      odds_label = paste0("OR = ", ecoc_infx_confusion_cnfs$odds_ratio[1])
    ) %>%
    filter(subgroup2 == "Enterococcus")

  # Every annotation label is constant across the ROC rows, so each
  # geom_text() layer would draw the same string once per row;
  # check_overlap = TRUE keeps a single copy per layer.
  ggplot(roc_data, aes(x = fpr, y = tpr, color = subgroup2)) +
    geom_line(size = 1.2) +
    geom_text(
      aes(label = auc_label, x = 0.45, y = 0.16),
      show.legend = FALSE, hjust = 0, size = 6, check_overlap = TRUE
    ) +
    geom_text(
      aes(label = acc_label, x = 0.45, y = 0.12),
      show.legend = FALSE, hjust = 0, size = 6, check_overlap = TRUE
    ) +
    geom_text(
      aes(label = spec_label, x = 0.45, y = 0.08),
      show.legend = FALSE, hjust = 0, size = 6, check_overlap = TRUE
    ) +
    geom_text(
      aes(label = sens_label, x = 0.45, y = 0.04),
      show.legend = FALSE, hjust = 0, size = 6, check_overlap = TRUE
    ) +
    geom_text(
      aes(
        label = paste0(
          "Cutpoint = ", round((optimal_cutpoint), digits = 3) * 100, "%"
        ),
        x = 0.45, y = 0.2
      ),
      hjust = 0, size = 6, show.legend = FALSE, check_overlap = TRUE
    ) +
    geom_text(
      aes(label = odds_label, x = 0.45, y = 0),
      show.legend = FALSE, hjust = 0, size = 6, check_overlap = TRUE
    ) +
    theme_bw() +
    theme(
      axis.text = et(color = "black", size = 12),
      axis.title = et(color = "black", size = 14),
      legend.text = et(size = 12),
      legend.title = et(size = 14),
      legend.spacing.y = unit(0.5, "cm"),
      legend.position = "none",
      panel.grid = eb(),
      strip.text = et(size = 18, color = "#2dc46b", face = "bold"),
      strip.background = eb(),
      panel.border = eb(),
      panel.spacing.y = unit(20, "mm"),
      axis.line.x = el(color = "black")
    ) +
    geom_vline(xintercept = -0.05) +
    geom_hline(yintercept = -0.05) +
    geom_table_npc(
      data = confusion_tbl,
      label = list(confusion_tbl),
      npcx = 0.15, npcy = 0.6, hjust = 0, vjust = 1,
      table.rownames = TRUE,
      table.theme = ttheme_minimal(
        base_size = 16,
        core = list(
          bg_params = list(col = "#2dc46b"),
          fg_params = list(col = "#2dc46b")
        ),
        colhead = list(
          fg_params = list(col = "#2dc46b", fontface = "bold")
        ),
        rowhead = list(
          fg_params = list(col = "#2dc46b", fontface = "bold")
        )
      )
    ) +
    scale_x_continuous(expand = expansion(add = c(0.001, 0.05))) +
    scale_y_continuous(expand = expansion(add = c(0.001, 0.05))) +
    facet_wrap(~subgroup2, ncol = 1, scales = "fixed") +
    ylab("True Positive Rate\n") +
    xlab("\nFalse Positive Rate") +
    scale_color_manual(values = c("#2dc46b")) +
    guides(
      color = guide_legend(
        "Groups",
        byrow = TRUE, override.aes = list(size = 5)
      )
    )
})

fig_5b

Figure 5D: Enterobacterales Infection and ROC

# Enterobacterales: the four infection x expansion counts, one column per cell
ebac_infx_confusion <- expan_infx_stats %>%
  filter(
    metric %in% c(
      "ebac_infx_ebac_below_cutpoint",
      "ebac_infx_ebac_above_cutpoint",
      "ebac_noinfx_ebac_below_cutpoint",
      "ebac_noinfx_ebac_above_cutpoint"
    )
  ) %>%
  select(metric, count) %>%
  pivot_wider(names_from = metric, values_from = count)

# 2 x 2 table: row 1 = infection, row 2 = no infection
ebac_infx_confusion_cnfs <- data.frame(
  Enterobacterales = c("Infection", "No Infection"),
  Expansion = c(
    ebac_infx_confusion$ebac_infx_ebac_above_cutpoint,
    ebac_infx_confusion$ebac_noinfx_ebac_above_cutpoint
  ),
  No.Expansion = c(
    ebac_infx_confusion$ebac_infx_ebac_below_cutpoint,
    ebac_infx_confusion$ebac_noinfx_ebac_below_cutpoint
  )
)

# Summary statistics from the table cells; each is a single value that is
# repeated on both rows.
ebac_infx_confusion_cnfs <- ebac_infx_confusion_cnfs %>%
  mutate(
    sensitivity = round(
      Expansion[1] / (Expansion[1] + No.Expansion[1]),
      3
    ),
    specificity = round(
      No.Expansion[2] / (No.Expansion[2] + Expansion[2]),
      3
    ),
    odds_ratio = round(
      (Expansion[1] / No.Expansion[1]) / (Expansion[2] / No.Expansion[2]),
      3
    )
  )


# Figure 5D: ROC curve for Enterobacterales relative abundance predicting
# Enterobacterales infection, annotated with AUC / accuracy / sensitivity /
# specificity (each with a bootstrap interval), the optimal cutpoint, the
# odds ratio and the 2 x 2 confusion table.
#
# Built inside local() so the helpers and temporaries below stay out of the
# global environment; only `fig_5d` is assigned.
fig_5d <- local({
  # Format a boot_ci() result as " [first, second]". Column 3 of the result
  # holds the interval values: row 1 supplies the first bound, row 2 the
  # second. `percent = TRUE` prints whole percentages, otherwise two decimals.
  format_boot_ci <- function(ci, percent) {
    bounds <- round(ci[[3]][1:2], 2)
    bounds <- if (percent) {
      paste0(formatC(bounds * 100, digits = 0, format = "f"), "%")
    } else {
      formatC(bounds, digits = 2, format = "f")
    }
    paste0(" [", bounds[1], ", ", bounds[2], "]")
  }

  # Interval suffixes for one cutpointr fit; boot_ci() runs once per metric.
  ci_suffixes <- function(cp) {
    list(
      auc = format_boot_ci(
        boot_ci(x = cp, AUC, alpha = 0.05), percent = FALSE
      ),
      acc = format_boot_ci(
        boot_ci(x = cp, acc, alpha = 0.05), percent = TRUE
      ),
      sens = format_boot_ci(
        boot_ci(x = cp, sensitivity, alpha = 0.05), percent = TRUE
      ),
      spec = format_boot_ci(
        boot_ci(x = cp, specificity, alpha = 0.05), percent = TRUE
      )
    )
  }

  ecoc_ci <- ci_suffixes(test_abundance[[4]])  # ecoc rel abd, ecoc infection
  ebac_ci <- ci_suffixes(test_abundance[[1]])  # ebac rel abd, ebac infection

  # Confusion table drawn inside the panel
  confusion_tbl <- ebac_infx_confusion_cnfs %>%
    column_to_rownames(var = "Enterobacterales") %>%
    select(Expansion, `No Expansion` = No.Expansion) %>%
    mutate(Total = rowSums(.))

  # One row per ROC point, carrying the (constant) annotation labels
  roc_data <- test_abundance_unnest %>%
    separate(
      subgroup,
      into = c("Input", "Predict"), sep = "\n", remove = FALSE
    ) %>%
    filter(
      grepl("enterobacterales", Input) & grepl("enterobacterales", Predict) |
        grepl("enterococcus", Input) & grepl("enterococcus", Predict)
    ) %>%
    unnest(roc_curve) %>%
    arrange(desc(AUC)) %>%
    select(-boot) %>%
    mutate(
      auc_label = paste0(
        "AUC = ", formatC(round(AUC, 3), digits = 2, format = "f")
      ),
      auc_label = case_when(
        grepl("enterococcus_rel_abundance", Input) ~
          paste0(auc_label, ecoc_ci$auc),
        grepl("enterobacterales_rel_abundance", Input) ~
          paste0(auc_label, ebac_ci$auc)
      ),
      acc_label = paste0(
        "Accuracy = ",
        formatC(round(acc, 3) * 100, digits = 0, format = "f"), "%"
      ),
      acc_label = case_when(
        grepl("enterococcus_rel_abundance", Input) ~
          paste0(acc_label, ecoc_ci$acc),
        grepl("enterobacterales_rel_abundance", Input) ~
          paste0(acc_label, ebac_ci$acc)
      ),
      sens_label = paste0(
        "Sensitivity = ",
        formatC(round(sensitivity, 3) * 100, digits = 0, format = "f"), "%"
      ),
      sens_label = case_when(
        grepl("enterococcus_rel_abundance", Input) ~
          paste0(sens_label, ecoc_ci$sens),
        grepl("enterobacterales_rel_abundance", Input) ~
          paste0(sens_label, ebac_ci$sens)
      ),
      spec_label = paste0(
        "Specificity = ",
        formatC(round(specificity, 3) * 100, digits = 0, format = "f"), "%"
      ),
      spec_label = case_when(
        grepl("enterococcus_rel_abundance", Input) ~
          paste0(spec_label, ecoc_ci$spec),
        grepl("enterobacterales_rel_abundance", Input) ~
          paste0(spec_label, ebac_ci$spec)
      )
    ) %>%
    filter(grepl(pattern = "rel_abundance", x = Input)) %>%
    mutate(
      subgroup = str_to_title(subgroup),
      subgroup = gsub(pattern = "_", replacement = " ", x = subgroup),
      subgroup = gsub(
        pattern = "rel abundance",
        replacement = "Relative Abundance (%)", x = subgroup
      ),
      subgroup2 = ifelse(
        grepl(x = subgroup, pattern = "enterococcus", ignore.case = TRUE),
        "Enterococcus", "Enterobacterales"
      ),
      subgroup2 = factor(
        subgroup2,
        levels = c("Enterococcus", "Enterobacterales")
      ),
      odds_label = paste0("OR = ", ebac_infx_confusion_cnfs$odds_ratio[1])
    ) %>%
    filter(subgroup2 == "Enterobacterales")

  # Every annotation label is constant across the ROC rows, so each
  # geom_text() layer would draw the same string once per row;
  # check_overlap = TRUE keeps a single copy per layer.
  ggplot(roc_data, aes(x = fpr, y = tpr, color = subgroup2)) +
    geom_line(size = 1.2) +
    geom_text(
      aes(label = auc_label, x = 0.45, y = 0.16),
      show.legend = FALSE, hjust = 0, size = 6, check_overlap = TRUE
    ) +
    geom_text(
      aes(label = acc_label, x = 0.45, y = 0.12),
      show.legend = FALSE, hjust = 0, size = 6, check_overlap = TRUE
    ) +
    geom_text(
      aes(label = spec_label, x = 0.45, y = 0.08),
      show.legend = FALSE, hjust = 0, size = 6, check_overlap = TRUE
    ) +
    geom_text(
      aes(label = sens_label, x = 0.45, y = 0.04),
      show.legend = FALSE, hjust = 0, size = 6, check_overlap = TRUE
    ) +
    geom_text(
      aes(
        label = paste0(
          "Cutpoint = ", round((optimal_cutpoint), digits = 3) * 100, "%"
        ),
        x = 0.45, y = 0.2
      ),
      hjust = 0, size = 6, show.legend = FALSE, check_overlap = TRUE
    ) +
    geom_text(
      aes(label = odds_label, x = 0.45, y = 0),
      show.legend = FALSE, hjust = 0, size = 6, check_overlap = TRUE
    ) +
    theme_bw() +
    theme(
      axis.text = et(color = "black", size = 12),
      axis.title = et(color = "black", size = 14),
      legend.text = et(size = 12),
      legend.title = et(size = 14),
      legend.spacing.y = unit(0.5, "cm"),
      legend.position = "none",
      panel.grid = eb(),
      strip.text = et(size = 18, color = "red", face = "bold"),
      strip.background = eb(),
      panel.border = eb(),
      panel.spacing.y = unit(20, "mm"),
      axis.line.x = el(color = "black")
    ) +
    geom_vline(xintercept = -0.05) +
    geom_hline(yintercept = -0.05) +
    geom_table_npc(
      data = confusion_tbl,
      label = list(confusion_tbl),
      npcx = 0.15, npcy = 0.5, hjust = 0, vjust = 1,
      table.rownames = TRUE,
      table.theme = ttheme_minimal(
        base_size = 16,
        core = list(
          bg_params = list(col = "red"),
          fg_params = list(col = "red")
        ),
        colhead = list(fg_params = list(col = "red", fontface = "bold")),
        rowhead = list(fg_params = list(col = "red", fontface = "bold"))
      )
    ) +
    scale_x_continuous(expand = expansion(add = c(0.001, 0.05))) +
    scale_y_continuous(expand = expansion(add = c(0.001, 0.05))) +
    facet_wrap(~subgroup2, ncol = 1, scales = "fixed") +
    ylab("True Positive Rate\n") +
    xlab("\nFalse Positive Rate") +
    scale_color_manual(values = c("red")) +
    guides(
      color = guide_legend(
        "Groups",
        byrow = TRUE, override.aes = list(size = 5)
      )
    )
})

fig_5d

Enterococcus Expansion: Volcano Analysis

# Lookup of heatmap compounds and their categories
heatmap_lookup <- read.csv(
  file = "./Data/qual_heatmap_lookup.csv",
  stringsAsFactors = FALSE
)

# Distinct, non-missing compound names (title-cased, with "Preq1" respelled
# as "PreQ1") that also appear in the lookup.
# NOTE(review): later steps rename "isovaleric-acid" to "isovalerate" before
# title-casing; this step does not — confirm the lookup spelling makes that
# compound survive the filter here.
heatmap_cmpds <- metab_qual_anon %>%
  mutate(compound = recode(str_to_title(compound), Preq1 = "PreQ1")) %>%
  filter(compound %in% heatmap_lookup$compound) %>%
  distinct(compound) %>%
  drop_na()

# Shared input for the fold-change and Wilcoxon steps below: heatmap
# compounds joined to Enterococcus relative abundance, flagged 1/0 for
# expansion (abundance at or above the Enterococcus cutpoint), grouped by
# compound, keeping only compounds with at least one non-zero mvalue.
# Building it once guarantees both statistics see exactly the same rows.
qual_ecoc_expan_input <- metab_qual_anon %>%
  mutate(
    compound = ifelse(compound == "isovaleric-acid", "isovalerate", compound),
    compound = str_to_title(compound),
    compound = recode(compound, Preq1 = "PreQ1")
  ) %>%
  filter(compound %in% heatmap_cmpds$compound) %>%
  left_join(
    peri_matrix_all %>%
      select(sampleID, enterococcus_rel_abundance)
  ) %>%
  drop_na() %>%
  mutate(
    enterococcus_expansion = ifelse(
      enterococcus_rel_abundance >= optimal_cutpoint_rel$optimal_cutpoint[2],
      1, 0
    )
  ) %>%
  arrange(enterococcus_expansion, sampleID) %>%
  group_by(compound) %>%
  filter(any(mvalue != 0))

# Per-compound log2 fold change: mean(No Expansion) / mean(Expansion), so a
# negative value means the compound is higher in the expansion group.
# 0 = No Expansion, 1 = Expansion
qual_log2fc_ecoc_expan <- qual_ecoc_expan_input %>%
  summarise(
    log2fc_val = log(
      mean(mvalue[enterococcus_expansion == 0], na.rm = TRUE) /
        mean(mvalue[enterococcus_expansion == 1], na.rm = TRUE),
      base = 2
    )
  )

# Per-compound Wilcoxon test between the two groups, BH-adjusted
qual_pval_ecoc_expan <- qual_ecoc_expan_input %>%
  rstatix::wilcox_test(mvalue ~ enterococcus_expansion) %>%
  rstatix::adjust_pvalue(method = "BH") %>%
  rstatix::add_significance("p.adj")

# Fold change and test results side by side, compounds as row names
qual_tot_ecoc_expan <- qual_log2fc_ecoc_expan %>%
  left_join(qual_pval_ecoc_expan) %>%
  column_to_rownames(var = "compound")

# Volcano label colours for compounds passing both thresholds: green when
# the fold change is negative, grey otherwise
ecoc_expan_volcano_labcol <- qual_tot_ecoc_expan %>%
  filter(
    p.adj <= 0.05,
    abs(log2fc_val) >= 0.75
  ) %>%
  mutate(
    color = ifelse(log2fc_val < 0, "#389458", "gray47")
  )

# # Volcano Plot (adjusted)
# set.seed(456)
# volcano_adj_ecoc_expan <- EnhancedVolcano(
#   qual_tot_ecoc_expan,
#   lab = rownames(qual_tot_ecoc_expan),
#   x = 'log2fc_val',
#   y = 'p.adj',
#   title = NULL,
#   pCutoff = 0.05,
#   FCcutoff = 0.75,
#   pointSize = 6,
#   labSize = 8,
#   axisLabSize = 32,
#   labCol = ecoc_expan_volcano_labcol$color,
#   caption = NULL,
#   colAlpha = 0.65,
#   col = c('gray75', c('#D4CA15', '#912777', '#1238E3')),
#   xlim = c(-2.5, 4),
#   ylim = c(0, 8),
#   legendPosition = 'none',
#   legendLabels = c(
#     expression(p.adj > 0.05*';' ~ Log[2] ~ FC < '\u00B1'*0.75),
#     expression(p.adj > 0.05*';' ~ Log[2] ~ FC >= '\u00B1'*0.75),
#     expression(p.adj <= 0.05*';' ~ Log[2] ~ FC < '\u00B1'*0.75),
#     expression(p.adj <= 0.05*';' ~ Log[2] ~ FC >= '\u00B1'*0.75)
#   ),
#   legendLabSize = 14,
#   boxedLabels = T,
#   drawConnectors = T,
#   widthConnectors = 0.2,
#   arrowheads = F,
#   gridlines.minor = F,
#   gridlines.major = F,
#   max.overlaps = Inf
# ) +
#   theme(
#     axis.text = et(color = 'black'),
#     legend.text = et(hjust = 0),
#     plot.margin = unit(c(0, 0, 0, 0), 'cm')
#   ) +
#   labs(subtitle = NULL) +
#   annotate('segment', x = 0.8, xend = 3, y = 7.95, yend = 7.95,
#            arrow = arrow(), size = 2, color = 'gray67') +
#   annotate('text', x = 1.65, y = 8.15, label = 'No Expansion',
#            size = 9, color = 'gray67') +
#   annotate('rect', xmin = 0.75, xmax = Inf,
#            ymin = -log(0.05, base = 10), ymax = Inf,
#            alpha = .1, fill = 'gray87') +
#   annotate('segment', x = -0.8, xend = -2.5, y = 7.95, yend = 7.95,
#            arrow = arrow(), size = 2, color = '#389458') +
#   annotate('text', x = -1.55, y = 8.15, label = 'Expansion',
#            size = 9, color = '#389458') +
#   annotate('rect', xmin = -0.75, xmax = -Inf,
#            ymin = -log(0.05, base = 10), ymax = Inf,
#            alpha = .1, fill = '#389458') +
#   guides(color = guide_legend(nrow = 4), shape = guide_legend(nrow = 4)) +
#   scale_y_continuous(expand = expansion(add = c(0, 0.15)))
#
# volcano_adj_ecoc_expan
#
# ggsave(plot = volcano_adj_ecoc_expan,
#        filename = './Results/Figure_5A.pdf', width = 24, height = 11)

Enterococcus Expansion: Distribution

# Down-select metabolites on the unadjusted p-value (p <= 0.05) together with
# |log2 fold change| >= 1, then keep their normalized peak areas per sample so
# the distributions can be plotted by Enterococcus expansion status.
boxplot_ecoc_expan <- metab_qual_anon %>%
    # Harmonise compound names with the labels used elsewhere
    mutate(compound = ifelse(compound == "isovaleric-acid", "isovalerate",
        compound)) %>%
    mutate(compound = str_to_title(compound)) %>%
    mutate(compound = recode(compound, Preq1 = "PreQ1")) %>%
    filter(compound %in% heatmap_cmpds$compound) %>%
    left_join(select(peri_matrix_all, sampleID, enterococcus_rel_abundance)) %>%
    drop_na() %>%
    # Samples at or above the second optimal cutpoint count as expansion
    mutate(enterococcus_expansion = ifelse(
        enterococcus_rel_abundance >= optimal_cutpoint_rel$optimal_cutpoint[2],
        "Expansion", "No Expansion"
    )) %>%
    arrange(enterococcus_expansion, sampleID) %>%
    group_by(compound) %>%
    # Per-compound sample counts in each expansion group
    mutate(
        enterococcus_expansion_0 = length(mvalue[enterococcus_expansion == "No Expansion"]),
        enterococcus_expansion_1 = length(mvalue[enterococcus_expansion == "Expansion"])
    ) %>%
    # Drop compounds whose values are all zero
    filter(any(mvalue != 0)) %>%
    right_join(
        qual_tot_ecoc_expan %>%
            rownames_to_column(var = "compound") %>%
            mutate(abs_log2fc_val = abs(log2fc_val)) %>%
            filter(p <= 0.05, abs_log2fc_val >= 1)
    )

# Draw and save the per-compound boxplots when at least one row survived the
# down-selection above; otherwise just report that there is nothing to plot.
if (nrow(boxplot_ecoc_expan) > 0) {
    # Boxplot per expansion group, one free-y facet per compound, with the
    # individual samples overlaid as jittered points.
    gg_boxplot_ecoc_expan <- ggpar(
        ggboxplot(boxplot_ecoc_expan,
            x = "enterococcus_expansion",
            y = "mvalue",
            color = "enterococcus_expansion",
            palette = c("#389458", "gray87"),
            ylab = "Normalized Peak Area",
            xlab = "",
            outlier.shape = NA
        ),
        legend.title = "Enterococcus"
    ) +
        stat_compare_means(label.y.npc = 0.75) +
        facet_wrap(~compound, scales = "free_y") +
        geom_point(
            data = boxplot_ecoc_expan,
            aes(x = enterococcus_expansion, y = mvalue, color = enterococcus_expansion),
            position = position_jitter(width = 0.2), size = 2, alpha = 0.65
        )

    print(gg_boxplot_ecoc_expan)

    # Pass the plot explicitly instead of relying on ggsave()'s implicit
    # last_plot() default, so the saved figure cannot silently become a
    # different plot if another one is drawn in between.
    ggsave(
        plot = gg_boxplot_ecoc_expan,
        filename = "./Results/enterococcus_expansion_boxplot.pdf",
        width = 24, height = 18
    )
} else {
    print("no significant observations")
}

taxUMAP Data Preparation

# Custom colors: a blue / dark-maroon pair, and a four-entry variant with
# three maroon entries followed by the blue
pirate_colors <- c("#1A49BE", "#3A001E")
pirate_colors2 <- c(rep("#3A001E", 3), "#1A49BE")

# Long-format relative-abundance table (one row per sample x taxon) used for
# taxUMAP and the diversity figures. The result stays grouped by sampleID.
t_metaphlan <- metaphlan_df %>%
    # Keep samples listed in first_samps plus any sample whose ID contains "hd"
    filter(sampleID %in% first_samps$sampleID | grepl(sampleID,
        pattern = "hd")) %>%
    # IDs containing "lt" are labelled Liver Transplant, everything else Healthy Donor
    mutate(db = ifelse(grepl(sampleID, pattern = "lt"), "Liver Transplant",
        "Healthy Donor")) %>%
    select(sampleID, taxid, db, pctseqs, Total) %>%
    # Collapse exact duplicates of a (sample, taxon, abundance) combination
    group_by(sampleID, taxid, pctseqs) %>%
    slice(1) %>%
    ungroup() %>%
    # Abundance filter: drop entries below 1e-04
    filter(pctseqs >= 1e-04) %>%
    # Prevalence filter: keep taxa detected in at least 10% of samples.
    # This has to be computed ACROSS samples. The previous version counted
    # taxa and samples inside group_by(sampleID), where the number of distinct
    # samples is always 1, so spPres was always >= 1 and nothing was removed.
    mutate(sampleID_count = n_distinct(sampleID)) %>%
    group_by(taxid) %>%
    mutate(totalSp = n_distinct(sampleID), spPres = totalSp / sampleID_count) %>%
    ungroup() %>%
    filter(spPres >= 0.1) %>%
    select(-c(Total, sampleID_count, spPres, totalSp)) %>%
    # Re-normalise so the retained taxa sum to 1 within each sample
    group_by(sampleID) %>%
    mutate(pctseqs = pctseqs / sum(pctseqs))


# Sample-by-taxon abundance matrix: one row per sample (row names = sampleID),
# one column per taxid, taxa missing from a sample filled with 0.
t_metaphlan_mat <- t_metaphlan %>%
    distinct() %>%
    pivot_wider(
        names_from = taxid,
        values_from = pctseqs,
        values_fill = 0
    ) %>%
    column_to_rownames("sampleID") %>%
    select(-db)


# taxUMAP microbiota table: same matrix as t_metaphlan_mat, but with the sample
# ID in an explicit "index_column" and every taxon column prefixed with "taxID"
taxumap_microbiota <- t_metaphlan_mat %>%
    rownames_to_column("index_column") %>%
    pivot_longer(
        cols = -index_column,
        names_to = "taxid",
        values_to = "pctseq"
    ) %>%
    mutate(taxid = paste0("taxID", taxid)) %>%
    pivot_wider(names_from = taxid, values_from = pctseq)

write.csv(
    taxumap_microbiota,
    file = "./Results/microbiota_table.csv",
    row.names = FALSE
)

# taxUMAP taxonomy table: one row per taxon present in the abundance matrix,
# keyed by the same "taxID"-prefixed identifier (column OTU). Empty ranks below
# Kingdom become "unclassified"; spaces in Genus/Species become underscores.
taxumap_taxonomy <- t_metaphlan_mat %>%
    rownames_to_column("index_column") %>%
    pivot_longer(cols = -index_column, names_to = "taxid", values_to = "pctseq") %>%
    # Column names are character, so the lookup key is cast to match
    left_join(mutate(tax_lookup, taxid = as.character(taxid))) %>%
    mutate(taxid = paste0("taxID", taxid)) %>%
    distinct(Kingdom, Phylum, Class, Order, Family, Genus, Species, taxid) %>%
    mutate(across(
        c(Phylum, Class, Order, Family, Genus, Species),
        ~ if_else(.x == "", "unclassified", .x)
    )) %>%
    mutate(across(c(Genus, Species), ~ gsub(" ", "_", .x))) %>%
    select(OTU = taxid, Kingdom, Phylum, Class, Order, Family, Genus, Species)

write.csv(taxumap_taxonomy, file = "./Results/taxonomy.csv", row.names = FALSE)

taxUMAP Analysis

import sys

# Print the interpreter search path (the recorded output is kept below)
print(sys.path)
## ['', '/Users/nick/miniconda3/bin', '/Library/Frameworks/R.framework/Versions/4.2/Resources/library/reticulate/config', '/Users/nick/miniconda3/lib/python310.zip', '/Users/nick/miniconda3/lib/python3.10', '/Users/nick/miniconda3/lib/python3.10/lib-dynload', '/Users/nick/.local/lib/python3.10/site-packages', '/Users/nick/miniconda3/lib/python3.10/site-packages', '/Users/nick/Downloads/taxumap', '/Library/Frameworks/R.framework/Versions/4.2/Resources/library/reticulate/python']
from taxumap.taxumap_base import Taxumap

# Build the Taxumap object from the two CSV files written to ./Results,
# with a fixed random_state
tu = Taxumap(
    taxonomy='./Results/taxonomy.csv',
    microbiota_data='./Results/microbiota_table.csv',
    random_state=456,
)
## Phylum Class
## not monophyletic
## Class Order
## not monophyletic
## Order Family
## not monophyletic
## Family Genus
## not monophyletic
## post validate inputs main                Kingdom  ...                          Species
## ASV                     ...                                 
## taxID817      Bacteria  ...             Bacteroides_fragilis
## taxID818      Bacteria  ...     Bacteroides_thetaiotaomicron
## taxID820      Bacteria  ...            Bacteroides_uniformis
## taxID821      Bacteria  ...             Phocaeicola_vulgatus
## taxID823      Bacteria  ...       Parabacteroides_distasonis
## ...                ...  ...                              ...
## taxID28447    Bacteria  ...        Clavibacter_michiganensis
## taxID33968    Bacteria  ...  Leuconostoc_pseudomesenteroides
## taxID709323   Bacteria  ...         Fructobacillus_tropaeoli
## taxID1070421  Bacteria  ...            Periweissella_fabalis
## taxID2749962  Bacteria  ...         Lactococcus_paracarnosus
## 
## [672 rows x 7 columns]
# Transform the data (an inplace function)
# NOTE(review): neigh=55 is presumably the neighbourhood size used for the
# embedding - confirm against the taxumap documentation.
tu.transform_self(neigh=55)
## Taxumap(agg_levels = ['Phylum', 'Family'], weights = [1, 1])

# Raw embedding dataframe; read from the R side as py$embedding_df
embedding_df = tu.df_embedding

Figure 1: Shotgun Metagenomics, Bile Acid Genes, taxUMAP and Diversity

#### Alpha Diversity ####
# Per-sample alpha diversity from the sample-by-taxon matrix. Each metric is
# returned as a two-column data frame (sampleID + metric). The piped
# as.data.frame() names its single column ".", which is then renamed.

# Inverse Simpson index
alpha_invsim <- t_metaphlan_mat %>%
  vegan::diversity(index = "invsimpson") %>%
  as.data.frame() %>%
  rownames_to_column("sampleID") %>%
  dplyr::rename(InvSimpson = ".")

# Shannon index
alpha_shannon <- t_metaphlan_mat %>%
  vegan::diversity(index = "shannon") %>%
  as.data.frame() %>%
  rownames_to_column("sampleID") %>%
  dplyr::rename(Shannon = ".")

# Richness: number of taxa observed per sample
alpha_richness <- t_metaphlan_mat %>%
  vegan::specnumber() %>%
  as.data.frame() %>%
  rownames_to_column("sampleID") %>%
  dplyr::rename(Richness = ".")

#### taxUMAP #####
# Most abundant taxon per sample, with its taxonomy, used to colour and size
# the taxUMAP points.
top_tax <- t_metaphlan %>%
  group_by(sampleID) %>%
  # with_ties = FALSE guarantees exactly one row per sample. By default
  # slice_max() keeps every tied row, which would duplicate that sample when
  # this table is joined onto the embedding.
  slice_max(pctseqs, n = 1, with_ties = FALSE) %>%
  left_join(tax_lookup) %>%
  # Turn empty strings into NA (every non-grouping column becomes character) ...
  mutate(across(everything(), ~ifelse(. == "", NA, as.character(.)))) %>%
  # ... so that missing ranks can be labelled "unclassified"
  replace_na(list(Species = "unclassified",
                  Genus = "unclassified",
                  Family = "unclassified",
                  Order = "unclassified",
                  Class = "unclassified",
                  Phylum = "unclassified")) %>%
  # For an unclassified genus, show the family on a first line above it;
  # pctseqs is restored to numeric after the character conversion above
  mutate(Genus = ifelse(Genus == "unclassified",
                        paste(Family, Genus, sep = "\n"),
                        as.character(Genus)),
         pctseqs = as.numeric(pctseqs))


# Abundance table annotated with taxonomy and Shannon diversity. Genus is
# replaced by a composite "Phylum-Order-Family-Genus" label, and y.text holds
# the midpoint of each taxon's segment in the per-sample cumulative abundance.
metaphlan_df2 <- t_metaphlan %>%
  mutate(db = factor(db, levels = c("Liver Transplant", "Healthy Donor"))) %>%
  left_join(tax_lookup) %>%
  drop_na(taxid) %>%
  arrange(Kingdom, Phylum, Class, Order, Family, Genus) %>%
  mutate(Genus = paste(Phylum, Order, Family, Genus, sep = "-")) %>%
  left_join(alpha_shannon) %>%
  group_by(sampleID) %>%
  arrange(Genus) %>%
  mutate(
    cum.pct = cumsum(pctseqs),
    # midpoint between the previous cumulative total (0 for the first row)
    # and the current one
    y.text = (cum.pct + dplyr::lag(cum.pct, default = 0)) / 2
  ) %>%
  ungroup() %>%
  dplyr::select(-cum.pct)

# One row per sample, with the sample assigned to a diversity group:
#   - healthy donors keep their own group;
#   - other samples with Shannon >= the lowest healthy-donor Shannon are "High";
#   - the remainder is split at its median Shannon into "Medium" and "Low".
metaphlan_df_sumry <- metaphlan_df2 %>%
  group_by(sampleID) %>%
  dplyr::slice(1) %>%
  ungroup() %>%
  # Step 1: healthy donors and high-diversity samples; the rest is "TBD"
  mutate(
    healthy_min_shannon = min(Shannon[db == "Healthy Donor"]),
    diversity_group = case_when(
      db == "Healthy Donor" ~ "Healthy Donor",
      Shannon >= healthy_min_shannon ~ "High Diversity",
      TRUE ~ "TBD"
    )
  ) %>%
  # Step 2: split the "TBD" samples at their median Shannon
  mutate(
    lt_med_shannon = median(Shannon[diversity_group == "TBD"], na.rm = TRUE),
    diversity_group = case_when(
      diversity_group %in% c("Healthy Donor", "High Diversity") ~ diversity_group,
      Shannon >= lt_med_shannon ~ "Medium Diversity",
      TRUE ~ "Low Diversity"
    )
  ) %>%
  # Step 3: ordered factors, plus an abbreviated label without " Diversity"
  mutate(
    diversity_group = factor(
      diversity_group,
      levels = c("Low Diversity", "Medium Diversity", "High Diversity", "Healthy Donor")
    ),
    diversity_group_abv = factor(
      gsub(pattern = " Diversity", replacement = "", x = diversity_group),
      levels = c("Low", "Medium", "High", "Healthy Donor")
    )
  ) %>%
  arrange(Genus)


# taxUMAP scatter plot (Figure 1B): one point per sample, coloured by its most
# abundant genus, sized by that taxon's relative abundance (in percent), with
# shape and transparency set by cohort.
tax_umap_mat_plot <- py$embedding_df %>%
  as.data.frame() %>%
  rownames_to_column("sampleID") %>%
  left_join(distinct(t_metaphlan, sampleID, db)) %>%
  left_join(top_tax) %>%
  ggplot(aes(
    x = taxumap1,
    y = taxumap2,
    color = Genus,
    shape = db,
    size = pctseqs * 100,
    alpha = db
  )) +
  geom_point(fill = "black") +
  theme_bw() +
  theme(
    panel.grid = eb(),
    axis.title = et(color = "black", size = 14),
    axis.text = et(color = "black", size = 12),
    axis.line = el(color = "black"),
    # legend.title = et(color = "black", size = 14),
    # legend.text = et(color = "black", size = 12),
    legend.position = "right"
  ) +
  labs(x = "taxUMAP1", y = "taxUMAP2") +
  # Only the cohort (shape) and abundance (size) legends are shown
  guides(
    shape = guide_legend(
      title = "Cohort",
      override.aes = list(size = 4),
      order = 1,
      ncol = 1,
      title.position = "top"
    ),
    size = guide_legend(
      title = "Relative Abundance (%)",
      order = 2,
      ncol = 1,
      title.position = "top"
    ),
    color = "none",
    fill = "none",
    alpha = "none"
  ) +
  scale_color_manual(values = metaphlan_pal) +
  scale_shape_manual(values = c(24, 16)) +
  scale_alpha_manual(values = c(1, 0.45))

tax_umap_mat_plot

ggsave(
  plot = tax_umap_mat_plot,
  filename = "./Results/Figure_1B.pdf",
  height = 8,
  width = 8
)

#### Alpha Diversity Stats #####
# Pairwise diversity-group comparisons tested below and drawn on the plots
diversity_comps <- list(
  c("Healthy Donor", "High"),
  c("High", "Medium"),
  c("High", "Low"),
  c("Medium", "Low")
)

# Two-sided Wilcoxon tests of each alpha-diversity metric between the
# diversity groups listed above, BH-adjusted within each metric
alpha_stats <- alpha_invsim %>%
  left_join(alpha_shannon) %>%
  left_join(alpha_richness) %>%
  # attach cohort and abbreviated diversity group, one row per sample
  inner_join(
    metaphlan_df2 %>%
      left_join(select(metaphlan_df_sumry, sampleID, db, diversity_group_abv)) %>%
      distinct(sampleID, db, diversity_group_abv)
  ) %>%
  pivot_longer(
    -c(sampleID, db, diversity_group_abv),
    names_to = "diversity_metric",
    values_to = "value"
  ) %>%
  group_by(diversity_metric) %>%
  rstatix::wilcox_test(
    value ~ diversity_group_abv,
    comparisons = diversity_comps,
    p.adjust.method = "BH",
    alternative = "two.sided"
  ) %>%
  ungroup()

# Create dataframe for all phylogenetic levels of interest: summed relative
# abundance per sample for each of five named clades. The clade is found by
# searching the full pipe-delimited lineage string, so it may sit at any rank.
phylo_rel_abd <- t_metaphlan %>%
  left_join(tax_lookup) %>%
  inner_join(
    metaphlan_df2 %>%
      left_join(select(metaphlan_df_sumry, sampleID, db, diversity_group_abv)) %>%
      distinct(sampleID, db, diversity_group_abv)
  ) %>%
  mutate(
    Species = paste(Kingdom, Phylum, Class, Order, Family, Genus, Species, sep = "|")
  ) %>%
  filter(grepl(
    "Enterococcus|Enterobacterales|Bacteroidetes|Lachnospiraceae|Oscillospiraceae",
    Species
  )) %>%
  # The first matching clade wins, in the order listed
  mutate(organism = case_when(
    grepl("Enterococcus", Species) ~ "Enterococcus",
    grepl("Enterobacterales", Species) ~ "Enterobacterales",
    grepl("Bacteroidetes", Species) ~ "Bacteroidetes",
    grepl("Lachnospiraceae", Species) ~ "Lachnospiraceae",
    grepl("Oscillospiraceae", Species) ~ "Oscillospiraceae"
  )) %>%
  select(sampleID, db, diversity_group_abv, organism, pctseqs) %>%
  group_by(sampleID, db, diversity_group_abv, organism) %>%
  summarise(pctseqs = sum(pctseqs)) %>%
  ungroup() %>%
  # Wide-then-long round trip adds explicit zeros for clades absent from a sample
  pivot_wider(names_from = organism, values_from = pctseqs, values_fill = 0) %>%
  pivot_longer(
    -c(sampleID, db, diversity_group_abv),
    names_to = "organism",
    values_to = "pctseqs"
  )

# Obtain stats for all phylogenetic levels of interest: pairwise Wilcoxon
# tests per organism, stacked with the alpha-diversity tests. BH adjustment is
# then applied across the combined table, and adjusted p-values are rounded to
# three decimals with a floor of 0.001.
rel_abd_alpha_stats <- phylo_rel_abd %>%
  group_by(organism) %>%
  rstatix::wilcox_test(pctseqs ~ diversity_group_abv) %>%
  bind_rows(alpha_stats) %>%
  rstatix::adjust_pvalue(method = "BH") %>%
  mutate(
    p.adj = ifelse(p.adj < 0.001, 0.001, round(p.adj, 3))
  )

write.csv(
  rel_abd_alpha_stats,
  file = "./Results/Figure_1_Statistics.csv",
  row.names = FALSE
)

# Mapping from p-value ranges to significance symbols, passed to
# stat_compare_means()
symnum.args <- list(
  cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, Inf),
  symbols = c("****", "***", "**", "*", "ns")
)

# One colour per diversity group; the scales that use it assign colours in
# factor-level order
diversity_group_colors <- c(
  "#3A001E",
  "#8A0246",
  "#C20463",
  "#1A49BE"
)

# #### Plot Inverse Simpson ####
# 
# 
# set.seed(456)
# gg_alpha_invsim <- alpha_invsim %>% 
#   inner_join(t_metaphlan %>% 
#                distinct(sampleID, db) %>% 
#                select(sampleID, db)
#              ) %>% 
#   inner_join(metaphlan_df %>%
#                left_join(metaphlan_df_sumry %>% select(sampleID, db, diversity_group, diversity_group_abv)) %>% 
#                distinct(sampleID, db, diversity_group, diversity_group_abv)
#              ) %>% 
#   mutate(db = factor(db, levels = c("Liver Transplant", "Healthy Donor"))) %>% 
#   ggplot(., aes(x = diversity_group_abv, 
#                 y = InvSimpson, 
#                 color = diversity_group_abv, 
#                 fill = db)) +
#   geom_boxplot(outlier.colour = NA,
#                alpha = 0.35) +
#   geom_jitter(width = 0.2, size = 2.5, alpha = 0.65)+
#   stat_compare_means(comparisons = diversity_comps,
#                      tip.length = 0.01,
#                      step.increase = 0.075,
#                      symnum.args = symnum.args,
#                      method.args = list(alternative = "two.sided",
#                                         exact = FALSE)
#                      ) +
#   theme_bw() +
#   theme(
#     panel.grid = eb(),
#     axis.title.y = et(size = 14, color = "black"),
#     axis.title.x = eb(),
#     axis.text = et(size = 12, color = "black"),
#     plot.margin = margin(t = 5,  # Top margin
#                          r = 5,  # Right margin
#                          b = 5,  # Bottom margin
#                          l = 5), # Left margin
#     panel.border = eb(),
#     axis.line = el(color = 'black')
#   ) +
#   ylab("Alpha Diversity\n(Inverse Simpson)") +
#   scale_fill_manual(values = rev(pirate_colors)) +
#   scale_color_manual(values = diversity_group_colors) +
#   guides(fill = guide_legend("Cohort"),
#          color = guide_legend("Diversity Group",
#                               override.aes = aes(label = "")))+
#   scale_y_continuous(breaks = seq(0,45,5),
#                      expand = expansion(mult = c(0.01, 0.1))) +
#   coord_cartesian(xlim = c(1.1,3.9))
# 
#   
# gg_alpha_invsim
# 
# pdf(file = "./Results/Figure_1C.pdf", height = 6, width = 7)
# gg_alpha_invsim
# invisible(dev.off())
# 
# #### Plot Shannon Diversity ####
# set.seed(456)
# gg_alpha_shannon <- alpha_shannon %>% 
#   inner_join(t_metaphlan %>% 
#                distinct(sampleID, db) %>% 
#                select(sampleID, db)
#              ) %>% 
#   inner_join(metaphlan_df %>%
#                left_join(metaphlan_df_sumry %>% select(sampleID, db, diversity_group, diversity_group_abv)) %>% 
#                distinct(sampleID, db, diversity_group, diversity_group_abv)
#              ) %>% 
#   mutate(db = factor(db, levels = c("Liver Transplant", "Healthy Donor"))) %>% 
#   ggplot(., aes(x = diversity_group_abv, 
#                 y = Shannon, 
#                 color = diversity_group_abv, 
#                 fill = db)) +
#   geom_boxplot(outlier.colour = NA,
#                alpha = 0.35) +
#   geom_jitter(width = 0.2, size = 2.5, alpha = 0.65)+
#   stat_compare_means(comparisons = diversity_comps,
#                      tip.length = 0.01,
#                      step.increase = 0.075,
#                      symnum.args = symnum.args,
#                      method.args = list(alternative = "two.sided",
#                                         exact = FALSE)
#                      ) + 
#   theme_bw() +
#   theme(
#     panel.grid = eb(),
#     axis.title.y = et(size = 14, color = "black"),
#     axis.title.x = eb(),
#     axis.text = et(size = 12, color = "black"),
#     plot.margin = margin(t = 5,  # Top margin
#                          r = 5,  # Right margin
#                          b = 5,  # Bottom margin
#                          l = 5), # Left margin
#     panel.border = eb(),
#     axis.line = el(color = 'black')
#   ) +
#   ylab("Alpha Diversity\n(Shannon)") +
#   scale_fill_manual(values = rev(pirate_colors)) +
#   scale_color_manual(values = diversity_group_colors) +
#   guides(fill = guide_legend("Cohort"),
#          color = guide_legend("Diversity Group",
#                               override.aes = aes(label = "")))+
#   scale_y_continuous(breaks = seq(0,6.5,1),
#                      expand = expansion(mult = c(0.01, 0.1))) +
#   coord_cartesian(xlim = c(1.1,3.9))
#   
#   
# gg_alpha_shannon
# 
# pdf(file = "./Results/Figure_1D.pdf", height = 6, width = 7)
# gg_alpha_shannon
# invisible(dev.off())
# 
# #### Plot Species Richness ####
# set.seed(456)
# gg_alpha_richness <- alpha_richness %>% 
#   inner_join(t_metaphlan %>% 
#                distinct(sampleID, db) %>% 
#                select(sampleID, db)
#              ) %>% 
#   inner_join(metaphlan_df %>%
#                left_join(metaphlan_df_sumry %>% select(sampleID, db, diversity_group, diversity_group_abv)) %>% 
#                distinct(sampleID, db, diversity_group, diversity_group_abv)
#              ) %>% 
#   mutate(db = factor(db, levels = c("Liver Transplant", "Healthy Donor"))) %>% 
#   ggplot(., aes(x = diversity_group_abv, 
#                 y = Richness, 
#                 color = diversity_group_abv, 
#                 fill = db)) +
#   geom_boxplot(outlier.colour = NA,
#                alpha = 0.35) +
#   geom_jitter(width = 0.2, size = 2.5, alpha = 0.65)+
#   stat_compare_means(comparisons = diversity_comps,
#                      tip.length = 0.01,
#                      label.y = c(145, 160, 150, 110),
#                      symnum.args = symnum.args,
#                      method.args = list(alternative = "two.sided",
#                                         exact = FALSE)
#                      ) +
#   theme_bw() +
#   theme(
#     panel.grid = eb(),
#     axis.title.y = et(size = 14, color = "black"),
#     axis.title.x = eb(),
#     axis.text = et(size = 12, color = "black"),
#     plot.margin = margin(t = 5,  # Top margin
#                          r = 5,  # Right margin
#                          b = 5,  # Bottom margin
#                          l = 5), # Left margin
#     panel.border = eb(),
#     axis.line = el(color = 'black')
#   ) +
#   ylab("Alpha Diversity\n(Richness)") +
#   scale_fill_manual(values = rev(pirate_colors)) +
#   scale_color_manual(values = diversity_group_colors) +
#   guides(fill = guide_legend("Cohort"),
#          color = guide_legend("Diversity Group",
#                               override.aes = aes(label = "")))+
#   scale_y_continuous(breaks = seq(0,180,50),
#                      expand = expansion(mult = c(0.01, 0.035))) +
#   coord_cartesian(xlim = c(1.1,3.9))
#   
#   
# gg_alpha_richness
# 
# pdf(file = "./Results/Figure_1E.pdf", height = 6, width = 7)
# gg_alpha_richness
# invisible(dev.off())

#### Relative-abundance boxplots by diversity group ####
# The five organism panels below were five copies of one ggplot pipeline that
# differed only in the organism name and the bracket heights. They now share
# this helper so the panels cannot drift apart.
#
# rel_abd          long table with columns sampleID, db, diversity_group_abv,
#                  organism and pctseqs (i.e. phylo_rel_abd)
# target_organism  value of `organism` to plot; also used (in italics) as the
#                  first line of the y-axis title
# label_y          y positions of the comparison brackets, one per entry of
#                  diversity_comps
#
# Returns a ggplot object. Relies on diversity_comps, symnum.args,
# pirate_colors, diversity_group_colors and the theme helpers eb()/et()/el()
# being defined earlier in the script.
plot_rel_abd_by_diversity <- function(rel_abd, target_organism, label_y) {
  # Same two-line plotmath title as the hand-written versions, with the
  # organism name substituted in
  y_title <- eval(bquote(
    ~atop(paste(italic(.(target_organism))), paste("MetaPhlAn4 Relative Abundance"))
  ))

  rel_abd %>%
    filter(organism == target_organism) %>%
    group_by(organism) %>%
    mutate(db = factor(db, levels = c("Liver Transplant", "Healthy Donor"))) %>%
    ggplot(., aes(x = diversity_group_abv,
                  y = pctseqs,
                  color = diversity_group_abv,
                  fill = db)) +
    geom_boxplot(outlier.colour = NA,
                 alpha = 0.35) +
    geom_jitter(width = 0.2, size = 2.5, alpha = 0.65) +
    stat_compare_means(comparisons = diversity_comps,
                       tip.length = 0.01,
                       symnum.args = symnum.args,
                       method.args = list(alternative = "two.sided",
                                          exact = FALSE),
                       label.y = label_y) +
    theme_bw() +
    theme(
      panel.grid = eb(),
      axis.title.y = et(size = 14, color = "black"),
      axis.title.x = eb(),
      axis.text = et(size = 12, color = "black"),
      plot.margin = margin(t = 5,  # Top margin
                           r = 5,  # Right margin
                           b = 5,  # Bottom margin
                           l = 5), # Left margin
      panel.border = eb(),
      axis.line = el(color = 'black')
    ) +
    ylab(y_title) +
    scale_fill_manual(values = rev(pirate_colors)) +
    scale_color_manual(values = diversity_group_colors) +
    guides(fill = guide_legend("Cohort"),
           color = guide_legend("Diversity Group",
                                override.aes = aes(label = ""))) +
    scale_y_continuous(breaks = seq(0, 1, 0.1),
                       limits = c(-0.01, 1.1),
                       expand = expansion(mult = c(0.01, 0.1)),
                       labels = scales::percent_format(accuracy = 1)) +
    coord_cartesian(xlim = c(1.1, 3.9))
}

#### Plot Lachnospiraceae ####
set.seed(456)
gg_lach_rel_abd <- plot_rel_abd_by_diversity(
  phylo_rel_abd, "Lachnospiraceae",
  label_y = c(0.65, 0.71, 0.76, 0.82)
)


# gg_lach_rel_abd

# pdf(file = "./Results/Figure_1F.pdf", height = 6, width = 7)
# gg_lach_rel_abd
# invisible(dev.off())

#### Plot Bacteroidetes ####
set.seed(456)
gg_bact_rel_abd <- plot_rel_abd_by_diversity(
  phylo_rel_abd, "Bacteroidetes",
  label_y = c(0.65, 0.98, 0.92, 1.02)
)


# gg_bact_rel_abd

# pdf(file = "./Results/Figure_1I.pdf", height = 6, width = 7)
# gg_bact_rel_abd
# invisible(dev.off())

#### Plot Oscillospiraceae ####
set.seed(456)
gg_oscl_rel_abd <- plot_rel_abd_by_diversity(
  phylo_rel_abd, "Oscillospiraceae",
  label_y = c(0.33, 0.52, 0.59, 0.66)
)


# gg_oscl_rel_abd

# pdf(file = "./Results/Figure_1J.pdf", height = 6, width = 7)
# gg_oscl_rel_abd
# invisible(dev.off())

#### Plot Enterococcus ####
set.seed(456)
gg_ecoc_rel_abd <- plot_rel_abd_by_diversity(
  phylo_rel_abd, "Enterococcus",
  label_y = c(0.20, 0.75, 0.985, 1.05)
)


# gg_ecoc_rel_abd

# pdf(file = "./Results/Figure_1G.pdf", height = 6, width = 7)
# gg_ecoc_rel_abd
# invisible(dev.off())

#### Plot Enterobacterales #####
set.seed(456)
gg_ebac_rel_abd <- plot_rel_abd_by_diversity(
  phylo_rel_abd, "Enterobacterales",
  label_y = c(0.20, 0.75, 0.985, 1.05)
)


# gg_ebac_rel_abd

# pdf(file = "./Results/Figure_1H.pdf", height = 6, width = 7)
# gg_ebac_rel_abd
# invisible(dev.off())

#### Plot Metaphlan Relative Abundance ####
# Figure 1A-MetaPhlAn4 Taxonomy

# Taxonomy legend, read from a PDF and wrapped as a drawable plot
tax_legend <- magick::image_read_pdf("./Data/legend.v2.pdf")
gg_tax_legend <- cowplot::ggdraw() +
  cowplot::draw_image(tax_legend)

# Colour palette for the composite Genus labels in metaphlan_df2
metaphlan_pal2 <- getRdpPal(metaphlan_df2)

# Stacked relative-abundance bars: one bar per sample, ordered by Shannon
# diversity and faceted by diversity group
gg_metaphlan <- metaphlan_df2 %>%
  left_join(select(metaphlan_df_sumry, sampleID, diversity_group)) %>%
  ungroup() %>%
  # fix the stacking order to the order in which the labels first appear
  mutate(Genus = factor(Genus, levels = unique(Genus))) %>%
  group_by(sampleID) %>%
  arrange(Genus) %>%
  ggplot(aes(x = reorder(sampleID, Shannon), y = pctseqs)) +
  geom_bar(aes(fill = Genus), stat = "identity", width = 0.9) +
  scale_fill_manual(values = metaphlan_pal2) +
  scale_y_continuous(
    expand = expansion(mult = c(0.005, 0.005)),
    labels = scales::percent_format(accuracy = 1)
  ) +
  facet_grid(. ~diversity_group, scales = "free", space = "free") +
  labs(x = "", y = "MetaPhlAn4 Relative Abundance") +
  theme_bw() +
  theme(
    legend.position = "none",
    axis.text.x = eb(),
    axis.ticks.x = eb(),
    strip.text.x = et(angle = 0, size = 14),
    strip.background = eb(),
    axis.title.y = et(color = "black", size = 14),
    axis.text.y = et(color = "black", size = 12),
    panel.spacing = unit(0.5, "lines"),
    plot.margin = margin(t = 5, r = 5, b = 0, l = 5)
  )


# Color facets
# Render the plot to a gtable and overwrite the text colour of every facet
# strip title, taking the colours from diversity_group_colors in strip order.
gg_metaphlan_grob <- ggplot_gtable(ggplot_build(gg_metaphlan))
strip_both <- grep("strip-", gg_metaphlan_grob$layout$name)
fills <- diversity_group_colors
k <- 1
for (i in strip_both) {
  # Position of the title grob among the children of this strip
  l <- grep("titleGrob", gg_metaphlan_grob$grobs[[i]]$grobs[[1]]$childrenOrder)
  gg_metaphlan_grob$grobs[[i]]$grobs[[1]]$children[[l]]$children[[1]]$gp$col <- fills[k]
  k <- k + 1
}

# Figure 1A (lower panel): per-sample Shannon diversity bars, using the same
# sample ordering and facets as the taxonomy panel so the two can be stacked.
gg_shannon <- ggplot(
  metaphlan_df_sumry,
  aes(x = reorder(sampleID, Shannon), y = Shannon)
) +
  geom_bar(aes(fill = diversity_group_abv), stat = "identity", width = 0.9) +
  facet_grid(. ~ diversity_group, scales = "free", space = "free") +
  scale_fill_manual(values = diversity_group_colors) +
  theme_bw() +
  theme(
    legend.position = "none",
    panel.grid = eb(),
    # x axis is fully hidden
    axis.title.x = eb(),
    axis.text.x = eb(),
    axis.ticks.x = eb(),
    # y axis
    axis.title.y = et(color = "black", size = 14),
    axis.text.y = et(color = "black", size = 12),
    # facet strips: no labels, white background
    strip.text = eb(),
    strip.background = er(fill = "white"),
    panel.spacing = unit(0.5, "lines"),
    plot.margin = margin(t = 0, r = 5, b = 0, l = 5)
  )

# Stack the strip-recoloured taxonomy panel on top of the Shannon panel; the
# Shannon panel gets 15% of the height given to the taxonomy panel.
gg_metaphlan_shannon <- plot_grid(
  gg_metaphlan_grob,
  gg_shannon,
  nrow = 2,
  rel_heights = c(1, 0.15),
  align = "hv",
  axis = "lb"
)

# Write Figure 1A (taxonomy + Shannon panels) to PDF.
# The plot is printed explicitly: a bare object name is only drawn through
# top-level auto-printing, which does not happen when the script is run via
# source(), leaving an empty PDF behind.
pdf(file = "./Results/Figure_1A.pdf", width = 12.25, height = 8)
print(gg_metaphlan_shannon)
invisible(dev.off())

#### Combine all into a single figure 1 Start ####
# 2-row grid of the five relative-abundance panels plus the UMAP panel.
# Only the gg_oscl_rel_abd panel keeps its legend (placed on the right).
# The alpha-diversity panels (gg_alpha_invsim, gg_alpha_shannon,
# gg_alpha_richness) are currently not part of this grid.
alpha_org_plot <- local({
  # Theme pieces shared by the panels; kept local so no extra globals appear
  tilted_x <- et(angle = 45, hjust = 0.85, vjust = 0.85)
  slim_margin <- unit(c(0.05, 0, 0, 0), "cm")

  plot_grid(
    gg_lach_rel_abd +
      theme(
        legend.position = "none",
        axis.title.x = eb(),
        axis.text.x = tilted_x,
        plot.margin = slim_margin
      ),
    gg_bact_rel_abd +
      theme(
        legend.position = "none",
        axis.text.x = tilted_x,
        plot.margin = slim_margin
      ),
    gg_oscl_rel_abd +
      theme(
        legend.position = "right",
        axis.text.x = tilted_x,
        plot.margin = slim_margin
      ),
    gg_ecoc_rel_abd +
      theme(
        legend.position = "none",
        axis.text.x = tilted_x,
        plot.margin = slim_margin
      ),
    gg_ebac_rel_abd +
      theme(
        legend.position = "none",
        axis.text.x = tilted_x,
        plot.margin = slim_margin
      ),
    tax_umap_mat_plot +
      theme(plot.margin = slim_margin),
    nrow = 2,
    align = "v",
    axis = "lb",
    rel_widths = c(1, 1, 1.3),
    rel_heights = c(1, 1)
  )
})

# Placeholder grobs (numbered, semi-transparent rectangles) that can be used to
# preview the general layout of the plots
gs <- lapply(seq_len(3), function(panel_id) {
  grobTree(
    rectGrob(gp = gpar(fill = panel_id, alpha = 0.5)),
    textGrob(panel_id)
  )
})

# grid.arrange(grobs=gs, ncol=4,
#                top="top label", bottom="bottom\nlabel",
#                left="left label", right="right label")

# Layout matrix (7 rows x 6 columns): cell 1 fills the first five columns of
# the top three rows, cell 2 sits in the last column of the second row (NA
# leaves the cells above and below it empty), and cell 3 fills the bottom four
# rows.
lay <- rbind(
  c(rep(1, 5), NA),
  c(rep(1, 5), 2),
  c(rep(1, 5), NA),
  rep(3, 6),
  rep(3, 6),
  rep(3, 6),
  rep(3, 6)
)

# grid.arrange(grobs = gs, layout_matrix = lay)

# Write the composite Figure 1 to PDF. The grobs are passed positionally and
# are placed in the cells numbered 1-3 of the layout matrix `lay`:
#   1 = taxonomy + Shannon panel, 2 = taxonomy legend, 3 = abundance/UMAP grid
pdf(file = "./Results/Figure_1.pdf", height = 16, width = 20)

grid.arrange(
  gg_metaphlan_shannon,                        # 1
  gg_tax_legend,                               # 2
  # Disabled alternative: UMAP panel placed directly in this layout
  # tax_umap_mat_plot + 
  #   theme(
  #     legend.text = et(size = 10),
  #     legend.title = et(size = 12),
  #     plot.margin = margin(
  #       t = 5,  # Top margin
  #       r = 0,  # Right margin
  #       b = 75, # Bottom margin
  #       l = 5   # Left margin
  #     )
    # ),                                      # 2
  alpha_org_plot,                           # 3
  layout_matrix = lay
)

invisible(dev.off())

# Draw the same composite once more after the PDF device has been closed, so
# that it also appears on whichever graphics device is then current
grid.arrange(
  gg_metaphlan_shannon,                        # 1
  gg_tax_legend,                               # 2
  # Disabled alternative: UMAP panel placed directly in this layout
  # tax_umap_mat_plot + 
  #   theme(
  #     legend.text = et(size = 10),
  #     legend.title = et(size = 12),
  #     plot.margin = margin(
  #       t = 5,  # Top margin
  #       r = 0,  # Right margin
  #       b = 75, # Bottom margin
  #       l = 5   # Left margin
  #     )
    # ),                                      # 2
  alpha_org_plot,                           # 3
  layout_matrix = lay
)

#### Combine all into a single figure 1 End ####

Figure 4: Pathobiont Abundance and Bile Acid Genes

# Figure 4 (top panel): per-sample relative abundance of the two pathobiont
# groups (Enterococcus, Enterobacterales). Healthy donors are excluded, and
# everything that is not a pathobiont is collapsed into "Other" with its
# abundance set to 0 so that it contributes no bar height.
gg_metaphlan_pathos <- metaphlan_df2 %>%
  left_join(select(metaphlan_df_sumry, sampleID, diversity_group)) %>%
  # Concatenate the full lineage so a match at any rank can be detected
  mutate(
    Species = paste(Kingdom, Phylum, Class, Order, Family, Genus, Species, sep = "-")
  ) %>%
  ungroup() %>%
  filter(diversity_group != "Healthy Donor") %>%
  mutate(
    Species = case_when(
      grepl(pattern = "Enterococcus", x = Species) ~ "Enterococcus",
      grepl(pattern = "Enterobacterales", x = Species) ~ "Enterobacterales",
      TRUE ~ "Other"
    )
  ) %>%
  group_by(sampleID, diversity_group, Shannon, Species) %>%
  summarise(pctseqs = sum(pctseqs)) %>%
  mutate(pctseqs = ifelse(Species == "Other", 0, pctseqs)) %>%
  ungroup() %>%
  arrange(Species) %>%
  mutate(Species = forcats::fct_relevel(Species)) %>%
  group_by(sampleID) %>%
  arrange(Species) %>%
  ggplot(aes(x = reorder(sampleID, Shannon), y = pctseqs)) +
  geom_bar(aes(fill = Species), stat = "identity", width = 0.9) +
  facet_grid(. ~ diversity_group, scales = "free_x") +
  # The third colour is fully transparent and paired with an empty break, so
  # only the two pathobiont groups are listed in the legend
  scale_fill_manual(
    "Pathobiont",
    breaks = c("Enterobacterales", "Enterococcus", ""),
    values = c("#FF0000", "#0C7A3A", "#00000000")
  ) +
  scale_y_continuous(
    labels = scales::percent_format(accuracy = 1),
    expand = expansion(mult = c(0.005, 0.005))
  ) +
  labs(x = "", y = "MetaPhlAn4 Relative Abundance") +
  theme_bw() +
  theme(
    legend.position = "right",
    panel.grid.minor = eb(),
    panel.grid.major.y = eb(),
    panel.spacing = unit(0.5, "lines"),
    axis.text.x = eb(),
    axis.ticks.x = eb(),
    axis.title.y = et(color = "black", size = 14),
    axis.text.y = et(color = "black", size = 12),
    strip.text.x = et(angle = 0, size = 14),
    strip.background = eb()
  )


# gg_metaphlan_pathos

#### Bile Acid Genes ####
# Wilcoxon tests of gene abundance (tpm) between diversity groups, run
# separately for every bile-acid gene. Only patients whose ID starts with "lt"
# and samples present in the taxonomy table are used.
ba_genes_wilcox_test <- ba_genes %>%
  filter(
    grepl(patientID, pattern = "^lt"),
    sampleID %in% metaphlan_df2$sampleID
  ) %>%
  mutate(
    gene = factor(
      gene,
      levels = c(
        "BaiA", "BaiA1", "BaiA2", "BaiB", "BaiCD", "BaiE", "BaiF", "BaiG",
        "BaiH", "BaiI", "BSH", "3aHSDH", "3bHSDH", "7aHSDH", "7bHSDH",
        "12abHSDH", "5AR", "5BR"
      )
    )
  ) %>%
  left_join(select(metaphlan_df_sumry, sampleID, Shannon, diversity_group)) %>%
  group_by(gene) %>%
  wilcox_test(tpm ~ diversity_group)
 
# Wilcoxon effect sizes of gene abundance (tpm) between diversity groups, per
# bile-acid gene. This is the companion of ba_genes_wilcox_test, so it is
# restricted to the same cohort: patients whose ID starts with "lt" and samples
# present in the taxonomy table. Without the patient filter the effect sizes
# would be computed on a different set of samples than the p-values.
ba_genes_wilcox_effect <- ba_genes %>%
  filter(grepl(patientID, pattern = "^lt")) %>% 
  mutate(gene = factor(gene, levels = c("BaiA", "BaiA1", "BaiA2",
                                        "BaiB", "BaiCD", "BaiE",
                                        "BaiF", "BaiG", "BaiH", 
                                        "BaiI", "BSH",
                                        "3aHSDH", "3bHSDH",
                                        "7aHSDH", "7bHSDH",
                                        "12abHSDH",
                                        "5AR", "5BR"))) %>% 
  filter(sampleID %in% metaphlan_df2$sampleID) %>% 
  left_join(metaphlan_df_sumry %>% select(sampleID, Shannon, diversity_group)) %>% 
  group_by(gene) %>% 
  wilcox_effsize(tpm ~ diversity_group)
 
# Per-sample gene abundance bars: one facet row per bile-acid gene and one
# facet column per diversity group, samples ordered by Shannon diversity.
gg_ba_genes <- ba_genes %>%
  filter(
    grepl(patientID, pattern = "^lt"),
    sampleID %in% metaphlan_df2$sampleID
  ) %>%
  mutate(
    gene = factor(
      gene,
      levels = c(
        "BaiA", "BaiA1", "BaiA2", "BaiB", "BaiCD", "BaiE", "BaiF", "BaiG",
        "BaiH", "BaiI", "BSH", "3aHSDH", "3bHSDH", "7aHSDH", "7bHSDH",
        "12abHSDH", "5AR", "5BR"
      )
    )
  ) %>%
  left_join(select(metaphlan_df_sumry, sampleID, Shannon, diversity_group)) %>%
  ungroup() %>%
  ggplot(aes(x = reorder(sampleID, Shannon), y = tpm, fill = diversity_group)) +
  geom_col() +
  facet_grid(gene ~ diversity_group, scales = "free_x") +
  scale_fill_manual(values = diversity_group_colors) +
  scale_y_continuous(expand = expansion(mult = c(0.005, 0.005))) +
  labs(x = "", y = "Gene Abundance (TPM)") +
  theme_bw() +
  theme(
    legend.position = "none",
    axis.text.x = eb(),
    axis.ticks.x = eb(),
    axis.title.y = et(color = "black", size = 14),
    axis.text.y = et(color = "black", size = 12),
    # only the gene (row) strips are labelled
    strip.text.x = eb(),
    strip.text.y = et(angle = 0, size = 14, hjust = 0),
    strip.background = eb(),
    panel.spacing = unit(0.5, "lines"),
    plot.margin = margin(t = 5, r = 5, b = 0, l = 5)
  )
 
# gg_ba_genes

# Write the taxonomy panel stacked above the bile-acid gene panel to PDF
# (height ratio 1:8). The return value of dev.off() is wrapped in invisible(),
# as for the other figures in this script, so closing the device does not echo
# the device name to the console.
pdf(file = "./Results/Metaphlan_Bile_Acid.pdf", height = 26, width = 20, onefile = FALSE)
gg.stack(gg_metaphlan,
         gg_ba_genes,
         heights = c(1, 8))
invisible(dev.off())
# Stats
# One between-group comparison plot per bile-acid gene (tpm by diversity
# group), using non-parametric tests with BH-adjusted pairwise comparisons;
# only significant pairwise comparisons are displayed.
gg_bile_gene_stats <- ggstatsplot::grouped_ggbetweenstats(
  data = ba_genes %>%
    filter(grepl(patientID, pattern = "^lt")) %>%
    mutate(
      gene = factor(
        gene,
        levels = c(
          "BaiA", "BaiA1", "BaiA2", "BaiB", "BaiCD", "BaiE", "BaiF", "BaiG",
          "BaiH", "BaiI", "BSH", "3aHSDH", "3bHSDH", "7aHSDH", "7bHSDH",
          "12abHSDH", "5AR", "5BR"
        )
      )
    ) %>%
    filter(sampleID %in% metaphlan_df2$sampleID) %>%
    left_join(select(metaphlan_df_sumry, sampleID, Shannon, diversity_group)),
  x = diversity_group,
  y = tpm,
  grouping.var = gene,
  type = "non-parametric",
  p.adjust.method = "BH",
  # pairwise.comparisons = FALSE,
  pairwise.display = "significant",
  results.subtitle = FALSE,
  ggsignif.args = list(textsize = 4, tip_length = 0.01),
  ggplot.component = list(
    scale_fill_manual(values = diversity_group_colors),
    scale_color_manual(values = diversity_group_colors)
  ),
  plotgrid.args = list(nrow = 3),
  annotation.args = list(title = "Bile Acid Gene")
) #+
  # geom_signif(comparisons = list(c("Medium Diversity", "High Diversity"),
  #                                c("Medium Diversity", "Low Diversity")), 
  #             test = "wilcox.test")

# Write the per-gene statistics plots to PDF.
# The plot is printed explicitly so the file is also filled when the script is
# run via source() (where top-level auto-printing does not happen), and the
# return value of dev.off() is hidden, as for the other figures in this script.
pdf(file = "./Results/Bile_Acid_Stats.pdf", height = 20, width = 40, onefile = FALSE)
print(gg_bile_gene_stats)
invisible(dev.off())
# Bile Genes Heatmap
# Tile plot of log10(tpm) per sample (x, ordered by Shannon) and gene (y, in
# reversed level order), faceted by diversity group. Infinite log values
# (tpm of 0) are shown as 0.
gg_ba_genes_heatmap <- ba_genes %>%
  filter(
    grepl(patientID, pattern = "^lt"),
    sampleID %in% metaphlan_df2$sampleID
  ) %>%
  mutate(
    gene = forcats::fct_rev(
      factor(
        gene,
        levels = c(
          "BaiA", "BaiA1", "BaiA2", "BaiB", "BaiCD", "BaiE", "BaiF", "BaiG",
          "BaiH", "BaiI", "BSH", "3aHSDH", "3bHSDH", "7aHSDH", "7bHSDH",
          "12abHSDH", "5AR", "5BR"
        )
      )
    )
  ) %>%
  left_join(select(metaphlan_df_sumry, sampleID, Shannon, diversity_group)) %>%
  group_by(gene) %>%
  mutate(
    tpm_zscore = scale(tpm, scale = TRUE, center = TRUE),
    tpm_log10 = log(tpm, base = 10),
    tpm_log10 = ifelse(is.infinite(tpm_log10), 0, tpm_log10)
  ) %>%
  ggplot(aes(x = reorder(sampleID, Shannon), y = gene, fill = tpm_log10)) +
  geom_tile(stat = "identity", color = "gray") +
  facet_grid(. ~ diversity_group, scales = "free_x") +
  # Legend breaks are labelled as powers of ten
  scale_fill_gradient2(
    low = "white",
    mid = "cyan1",
    high = "#0c0970",
    midpoint = 3.5,
    limits = c(0, 7),
    breaks = seq(1, 7, by = 1),
    # labels = c("1", "2", "3", "4", "5", "6", "7"),
    labels = scales::label_math()
  ) +
  # scale_fill_fermenter(palette = "YlGnBu", direction = 1) +
  # scale_x_discrete(expand=c(0,0))+
  scale_y_discrete(expand = c(0, 0)) +
  guides(fill = guide_legend(title = "TPM", reverse = TRUE)) +
  labs(x = "", y = "Gene\n") +
  theme_bw() +
  theme(
    plot.background = eb(),
    panel.grid = eb(),
    panel.border = er(colour = "black", fill = NA, linewidth = 0.5),
    axis.line = el(color = "black"),
    axis.title = et(color = "black", size = 14),
    axis.text.x = eb(),
    axis.ticks.x = eb(),
    axis.text.y = et(color = "black", size = 12),
    strip.background = eb(),
    strip.text.x = eb(),
    strip.text.y = et(angle = 0, size = 14, hjust = 0)
  )

# Number of bile-acid genes detected (tpm > 0) per sample, drawn as bars
# ordered by Shannon diversity and faceted by diversity group. A dashed line
# and a dedicated axis label mark the total of 18 genes.
gg_ba_genes_bars <- ba_genes %>%
  filter(
    grepl(patientID, pattern = "^lt"),
    sampleID %in% metaphlan_df2$sampleID
  ) %>%
  mutate(
    gene = factor(
      gene,
      levels = c(
        "BaiA", "BaiA1", "BaiA2", "BaiB", "BaiCD", "BaiE", "BaiF", "BaiG",
        "BaiH", "BaiI", "BSH", "3aHSDH", "3bHSDH", "7aHSDH", "7bHSDH",
        "12abHSDH", "5AR", "5BR"
      )
    )
  ) %>%
  left_join(select(metaphlan_df_sumry, sampleID, Shannon, diversity_group)) %>%
  # 1 when the gene is detected in the sample, 0 otherwise
  mutate(pres_abs = ifelse(tpm > 0, 1, 0)) %>%
  group_by(sampleID, Shannon, diversity_group) %>%
  summarise(pres_abs = sum(pres_abs)) %>%
  # summarise(tot_pres_abs = sum(pres_abs)) %>% 
  # mutate(pct_pres = tot_pres_abs / length(unique(ba_genes$gene))) %>%
  ggplot(aes(x = reorder(sampleID, Shannon), y = pres_abs, fill = diversity_group)) +
  geom_col() +
  geom_hline(yintercept = 18, linetype = "dashed") +
  # annotate(geom = "text", x = 10, y = 18.5, label = "18 Genes Total") +
  facet_grid(. ~ diversity_group, scales = "free_x") +
  scale_fill_manual(values = diversity_group_colors) +
  # scale_x_discrete(expand=c(0,0)) +
  scale_y_continuous(
    limits = c(0, 20),
    breaks = c(seq(0, 15, 5), 18),
    labels = c("0", "5", "10", "15", "18 Total Genes"),
    expand = c(0, 0)
  ) +
  guides(fill = guide_legend(title = "Diversity Group", reverse = TRUE)) +
  labs(x = "", y = "Genes Present (Total) \n") +
  theme_bw() +
  theme(
    legend.position = "none",
    plot.background = eb(),
    panel.grid = eb(),
    panel.border = er(colour = "black", fill = NA, linewidth = 0.5),
    axis.line = el(color = "black"),
    axis.title = et(color = "black", size = 14),
    axis.text.x = eb(),
    axis.ticks.x = eb(),
    axis.text.y = et(color = "black", size = 12),
    strip.background = eb(),
    strip.text.x = eb(),
    strip.text.y = et(angle = 0, size = 14, hjust = 0)
  )

# Kruskal-Wallis test of the fraction of bile-acid genes detected per sample
# (genes with tpm > 0 divided by the number of distinct genes in ba_genes)
# across diversity groups.
ba_genes_stats <- ba_genes %>%
  filter(
    grepl(patientID, pattern = "^lt"),
    sampleID %in% metaphlan_df2$sampleID
  ) %>%
  mutate(
    gene = factor(
      gene,
      levels = c(
        "BaiA", "BaiA1", "BaiA2", "BaiB", "BaiCD", "BaiE", "BaiF", "BaiG",
        "BaiH", "BaiI", "BSH", "3aHSDH", "3bHSDH", "7aHSDH", "7bHSDH",
        "12abHSDH", "5AR", "5BR"
      )
    )
  ) %>%
  left_join(select(metaphlan_df_sumry, sampleID, Shannon, diversity_group)) %>%
  mutate(pres_abs = ifelse(tpm > 0, 1, 0)) %>%
  group_by(sampleID, Shannon, diversity_group) %>%
  summarise(tot_pres_abs = sum(pres_abs)) %>%
  mutate(pct_pres = tot_pres_abs / length(unique(ba_genes$gene))) %>%
  ungroup() %>%
  kruskal_test(pct_pres ~ diversity_group)
  # wilcox_test(pct_pres ~ diversity_group, comparisons = list(
  #   c("High Diversity", "Medium Diversity"),
  #   c("High Diversity", "Low Diversity"),
  #   c("Medium Diversity", "Low Diversity")
  # ))

# Annotation table for the Kruskal-Wallis p-value label: one row per diversity
# group, with a label and y position (pct_pres = 0.9) only for the
# "Low Diversity" row; the other two rows are NA.
genes_stats_anno <- data.frame(
  diversity_group = factor(
    c("Low Diversity", "Medium Diversity", "High Diversity"),
    levels = c("Low Diversity", "Medium Diversity", "High Diversity")
  ),
  pct_pres = c(0.9, NA, NA),
  label = c(
    paste0("Kruskal-Wallis, p = ", scales::scientific(ba_genes_stats$p)),
    NA,
    NA
  )
)

# Distribution of the fraction of bile-acid genes detected per sample, shown
# per diversity group as violin + box plot + jittered points, with the
# Kruskal-Wallis label taken from genes_stats_anno.
gg_ba_genes_box <- ba_genes %>%
  filter(
    grepl(patientID, pattern = "^lt"),
    sampleID %in% metaphlan_df2$sampleID
  ) %>%
  mutate(
    gene = factor(
      gene,
      levels = c(
        "BaiA", "BaiA1", "BaiA2", "BaiB", "BaiCD", "BaiE", "BaiF", "BaiG",
        "BaiH", "BaiI", "BSH", "3aHSDH", "3bHSDH", "7aHSDH", "7bHSDH",
        "12abHSDH", "5AR", "5BR"
      )
    )
  ) %>%
  left_join(select(metaphlan_df_sumry, sampleID, Shannon, diversity_group)) %>%
  mutate(pres_abs = ifelse(tpm > 0, 1, 0)) %>%
  group_by(sampleID, Shannon, diversity_group) %>%
  summarise(tot_pres_abs = sum(pres_abs)) %>%
  mutate(pct_pres = tot_pres_abs / length(unique(ba_genes$gene))) %>%
  ggplot(aes(x = diversity_group, y = pct_pres, fill = diversity_group)) +
  geom_violin(trim = TRUE, alpha = 0.85) +
  geom_boxplot(alpha = 0.5, fill = "white", width = 0.4) +
  geom_jitter(
    color = "black",
    fill = "white",
    alpha = 0.5,
    shape = 21,
    size = 1.5,
    # fixed seed keeps the jittered point positions reproducible
    position = position_jitter(width = 0.15, seed = 123)
  ) +
  geom_text(
    data = genes_stats_anno,
    label = genes_stats_anno$label,
    nudge_x = -0.35
  ) +
  facet_grid(. ~ diversity_group, scales = "free", space = "free") +
  scale_fill_manual(values = diversity_group_colors) +
  scale_x_discrete(expand = expansion(mult = c(2, 2))) +
  scale_y_continuous(
    breaks = seq(0, 1, 0.25),
    labels = c("0%", "25%", "50%", "75%", "100%"),
    expand = expansion(mult = c(0.01, 0.03))
  ) +
  guides(
    fill = guide_legend(title = "Diversity Group"),
    color = guide_legend(title = "Diversity Group")
  ) +
  labs(x = "", y = "Genes Present (%) \n") +
  theme_bw() +
  theme(
    legend.position = "right",
    plot.background = eb(),
    panel.grid = eb(),
    panel.border = er(colour = "black", fill = NA, linewidth = 0.5),
    axis.line = el(color = "black"),
    axis.title = et(color = "black", size = 14),
    axis.text.x = eb(),
    axis.ticks.x = eb(),
    axis.text.y = et(color = "black", size = 12),
    strip.background = eb(),
    strip.text.x = eb(),
    strip.text.y = et(angle = 0, size = 14, hjust = 0)
  )

# pdf(file = "./Results/Figure_5.pdf", height = 14, width = 20, onefile = FALSE)
# gg.stack(gg_metaphlan_family,
#          # gg_metaphlan,
#          gg_ba_genes_heatmap,
#          gg_ba_genes_bars,
#          gg_ba_genes_box,
#          heights = c(0.2, 0.5, 0.2, 0.2),
#          gap = 2)
# dev.off()

# Write Figure 4 to PDF: pathobiont abundance, bile-acid gene heatmap, gene
# count bars and gene fraction violins stacked vertically with patchwork (`/`),
# sharing one collected legend. Each panel gets a -5pt bottom margin and the
# panels are separated by spacer rows with negative heights (-0.03).
# NOTE(review): the negative spacer heights presumably pull neighbouring panels
# closer together than patchwork's default spacing; confirm on the rendered PDF.
pdf(file = "./Results/Figure_4.pdf", height = 10, width = 16, onefile = FALSE)
(
    (gg_metaphlan_pathos + theme(plot.margin = margin(t = 0, r = 5, b = -5, l = 5, "pt"))) /
    patchwork:: plot_spacer() /
    (gg_ba_genes_heatmap + theme(plot.margin = margin(t = 0, r = 5, b = -5, l = 5, "pt"))) /
    patchwork::plot_spacer() / 
    (gg_ba_genes_bars + theme(plot.margin = margin(t = 0, r = 5, b = -5, l = 5, "pt"))) / 
    patchwork::plot_spacer() / 
    (gg_ba_genes_box + theme(plot.margin = margin(t = 0, r = 5, b = -5, l = 5, "pt")))
  ) + 
  patchwork::plot_layout(guides = 'collect', 
                         heights = c(0.15, 
                                     -0.03, # Spacer
                                     0.6,
                                     -0.03, # Spacer
                                     0.3, 
                                     -0.03, # Spacer
                                     0.3))
invisible(dev.off())

# Build the same composition again after the PDF device has been closed; as a
# top-level expression its value is auto-printed to the then-current device
{
    ((gg_metaphlan_pathos + theme(plot.margin = margin(t = 0, r = 5, b = -5, l = 5, "pt"))) /
    patchwork:: plot_spacer() /
    (gg_ba_genes_heatmap + theme(plot.margin = margin(t = 0, r = 5, b = -5, l = 5, "pt"))) /
    patchwork::plot_spacer() / 
    (gg_ba_genes_bars + theme(plot.margin = margin(t = 0, r = 5, b = -5, l = 5, "pt"))) / 
    patchwork::plot_spacer() / 
    (gg_ba_genes_box + theme(plot.margin = margin(t = 0, r = 5, b = -5, l = 5, "pt")))
  ) + 
  patchwork::plot_layout(guides = 'collect', 
                         heights = c(0.15, 
                                     -0.03, # Spacer
                                     0.6,
                                     -0.03, # Spacer
                                     0.3, 
                                     -0.03, # Spacer
                                     0.3))
}

Figure 2: Qualitative Metabolomics Heatmap

# Long-format table behind the metabolomics heatmap: one row per sample and
# compound with a median-normalised log2 value (heatmap_val), plus cohort (db),
# abbreviated diversity group and Shannon diversity for ordering.
heatmap_data <- 
  t_metaphlan %>% 
  drop_na(taxid) %>% 
  # Reduce the taxonomy table to one row per sample
  select(sampleID) %>% 
  group_by(sampleID) %>% 
  dplyr::slice(1) %>% 
  # Provisional cohort label derived from the sample ID; dropped again below
  # and replaced by the db column of metaphlan_df_sumry
  mutate(db = ifelse(grepl(sampleID, pattern = "lt"), "Liver Transplant", "Healthy Donor")) %>%
  left_join(metab_qual_anon) %>% 
  mutate(compound = ifelse(compound == "isovaleric-acid", "isovalerate", compound),
         compound = str_to_title(compound),
         compound = recode(compound,
                          Preq1 = "PreQ1")) %>% 
  # Keep the heatmap compounds; rows without any compound are kept as well
  filter(compound %in% heatmap_cmpds$compound|is.na(compound)) %>% 
  group_by(compound) %>% 
  # Per compound: log2 fold change of each value against the compound median.
  # When the median is 0, a tenth of the smallest positive value is used as
  # the reference instead; a fold change of -Inf (mvalue of 0) is set to 0.
  mutate(median_val = ifelse(median(mvalue, na.rm = TRUE) == 0, min(mvalue[mvalue > 0], na.rm = TRUE)/10, median(mvalue, na.rm = TRUE)),
         heatmap_val = ifelse(log(mvalue/ median_val, base = 2) == -Inf, 0, log(mvalue/ median_val, base = 2))) %>% 
  ungroup() %>% 
  select(-c(mvalue, median_val)) %>% 
  # One value per sample and compound: the largest one, ties broken by taking
  # the first row. FALSE is spelled out because F is an ordinary, reassignable
  # variable.
  group_by(sampleID, compound) %>% 
  slice_max(heatmap_val, with_ties = FALSE, n = 1) %>% 
  ungroup() %>%
  select(-db) %>% 
  left_join(metaphlan_df2 %>%
               left_join(metaphlan_df_sumry %>% select(sampleID, db, diversity_group_abv)) %>%
               distinct(sampleID, db, diversity_group_abv), by = "sampleID") %>% 
  group_by(sampleID, compound) %>% 
  slice_max(heatmap_val, with_ties = FALSE, n = 1) %>% 
  left_join(alpha_shannon) %>% 
  # Final row order: by cohort, then by increasing Shannon diversity
  group_by(db) %>% 
  arrange(db, Shannon) %>% 
  ungroup()

# Stats
# Kruskal-Wallis test per compound of heatmap_val across the diversity groups
# (healthy donors excluded), with Benjamini-Hochberg adjusted p-values.
heatmap_pvals <- heatmap_data %>%
  drop_na(compound) %>%
  filter(diversity_group_abv != "Healthy Donor") %>%
  group_by(compound) %>%
  rstatix::kruskal_test(heatmap_val ~ diversity_group_abv) %>%
  rstatix::adjust_pvalue(method = "BH") %>%
  select(compound, statistic, p, p.adj)

# One diversity-group label per sample, in heatmap column order (by cohort,
# then by Shannon diversity). Abbreviated groups are expanded to
# "<group> Diversity"; labels containing "Healthy" are kept as they are.
heatmap_labels <- 
  heatmap_data %>% 
  mutate(compound = str_to_title(compound)) %>% 
  left_join(heatmap_lookup) %>%
  left_join(heatmap_pvals %>% group_by(compound, p.adj, p) %>% slice(1)) %>%
  arrange(class, subclass, p) %>%
  arrange(class, subclass) %>% 
  # id_cols is passed by name: in current tidyr it comes after `...` in the
  # signature of pivot_wider(), and supplying it by position is deprecated
  pivot_wider(id_cols = c(diversity_group_abv, sampleID, db, Shannon), names_from = "compound", values_from = "heatmap_val", values_fn = mean) %>% 
  group_by(db) %>% 
  arrange(db, Shannon) %>% 
  ungroup() %>% 
  distinct(sampleID, .keep_all = TRUE) %>% 
  select(diversity_group_abv) %>% 
  mutate(
    diversity_group_abv = as.character(diversity_group_abv),
    diversity_group_abv = ifelse(
      grepl(pattern = "Healthy", as.character(diversity_group_abv)),
      diversity_group_abv,
      paste(diversity_group_abv, "Diversity")
    ),
    diversity_group_abv = factor(diversity_group_abv, levels = c("Low Diversity", "Medium Diversity",
                                                                 "High Diversity", "Healthy Donor"))
  )
  
# Build heatmap compound order (row order) and compound class (row slice)
# Classes and subclasses from heatmap_lookup are collapsed into the display
# groups below, turned into ordered factors, and the compounds are sorted by
# class, subclass, raw p-value and name.
heatmap_order <- heatmap_data %>%
  ungroup() %>%
  filter(compound %in% heatmap_cmpds$compound) %>%
  distinct(compound) %>%
  drop_na() %>%
  left_join(heatmap_lookup) %>%
  mutate(
    class = case_when(
      class %in% c("Dicarboxylic Acid") ~ "Fatty Acid",
      class %in% c("Phenolic Aromatic", "Kynurine Pathway") ~ "Add. Cmpds.",
      TRUE ~ class
    ),
    subclass = case_when(
      subclass %in% c(
        "Branched-Chain Fatty Acid",
        "Aminated Fatty Acid",
        "Long-Chain Fatty Acid",
        "Dicarboxylic Acid"
      ) ~ "Other Fatty Acid",
      subclass == "Indole" ~ "Tryptophan Metabs.",
      subclass %in% c("Phenolic Aromatic", "Kynurine Pathway") ~ "Add. Cmpds.",
      TRUE ~ subclass
    )
  ) %>%
  left_join(heatmap_pvals %>% group_by(compound, p.adj, p) %>% slice(1)) %>%
  ungroup() %>%
  mutate(
    # Display order of the six classes
    class = factor(
      class,
      levels = c(
        "Fatty Acid",
        "Amino Acid",
        "Bile Acid",
        "Indole",
        "Add. Cmpds.",
        "Vitamin"
      )
    ),
    # Display order of the nine subclasses (these become the row slices)
    subclass = factor(
      subclass,
      levels = c(
        "Short-Chain Fatty Acid",
        "Other Fatty Acid",
        "Amino Acid",
        "Primary Bile Acid",
        "Secondary Bile Acid",
        "Conjugated Bile Acid",
        "Tryptophan Metabs.",
        "Vitamin",
        "Add. Cmpds."
      )
    )
  ) %>%
  arrange(class, subclass, p, compound) %>%
  select(class, subclass, p, compound)
  
# Build heatmap patient order following same order as gg_metaphlan plot
# Result: character/ID vector of patientIDs sorted by cohort (db) and then by
# increasing Shannon diversity.
heatmap_column_order <- heatmap_data %>% 
  # One row per patient (the first row of each patient is kept)
  group_by(patientID) %>% 
  dplyr::slice(1) %>% 
  left_join(alpha_shannon) %>% 
  group_by(db) %>% 
  arrange(db, Shannon) %>%  
  # The data are still grouped by db here, so db is carried along by select()
  select(patientID) %>% 
  distinct(patientID) %>% 
  pull(patientID)

# P-value legend color
# Colour ramp for adjusted p-values: green at 0 fading to light grey at 0.045
pvalue_col_fun <- colorRamp2(
  breaks = c(0, 0.045),
  colors = c("#75C236", "#E3E4E6")
)

# Create heatmap for adjusted p-values
# Single-column heatmap ("KW Test") of the BH-adjusted Kruskal-Wallis p-value
# per compound. Rows follow heatmap_order and are split by its subclass.
pvalue_adj <- heatmap_pvals %>%
  group_by(compound, p.adj, p) %>%
  dplyr::slice(1) %>%
  ungroup() %>%
  left_join(heatmap_lookup) %>%
  mutate(
    class = case_when(
      class %in% c("Dicarboxylic Acid") ~ "Fatty Acid",
      class %in% c("Phenolic Aromatic", "Kynurine Pathway") ~ "Add. Cmpds.",
      TRUE ~ class
    ),
    subclass = case_when(
      subclass %in% c(
        "Branched-Chain Fatty Acid",
        "Aminated Fatty Acid",
        "Long-Chain Fatty Acid",
        "Dicarboxylic Acid"
      ) ~ "Other Fatty Acid",
      subclass == "Indole" ~ "Tryptophan Metabs.",
      subclass %in% c("Phenolic Aromatic", "Kynurine Pathway") ~ "Add. Cmpds.",
      TRUE ~ subclass
    )
  ) %>%
  left_join(heatmap_pvals %>% group_by(compound, p.adj, p) %>% slice(1)) %>%
  ungroup() %>%
  mutate(
    # Display order of the six classes
    class = factor(
      class,
      levels = c(
        "Fatty Acid",
        "Amino Acid",
        "Bile Acid",
        "Indole",
        "Add. Cmpds.",
        "Vitamin"
      )
    ),
    # Display order of the nine subclasses
    subclass = factor(
      subclass,
      levels = c(
        "Short-Chain Fatty Acid",
        "Other Fatty Acid",
        "Amino Acid",
        "Primary Bile Acid",
        "Secondary Bile Acid",
        "Conjugated Bile Acid",
        "Tryptophan Metabs.",
        "Vitamin",
        "Add. Cmpds."
      )
    )
  ) %>%
  arrange(class, subclass, p.adj, compound) %>%
  select(class, subclass, p.adj, compound) %>%
  # Turn the table into a one-column numeric matrix with compounds as row names
  column_to_rownames(var = "compound") %>%
  select(`Adjusted p-value` = p.adj) %>%
  as.matrix() %>%
  Heatmap(
    name = "Significance (adjusted p-value)",
    col = pvalue_col_fun,
    rect_gp = gpar(col = "black", lwd = 0.2),
    width = unit(0.1, "in"),
    # no clustering: the order is fully prescribed
    cluster_rows = FALSE,
    cluster_columns = FALSE,
    # column title / names
    column_title = "KW Test",
    column_title_side = "bottom",
    column_title_rot = 90,
    show_column_names = FALSE,
    column_names_side = "top",
    # rows: compound names on the left, sliced by subclass
    row_order = heatmap_order$compound,
    row_split = heatmap_order$subclass,
    row_gap = unit(3.5, "mm"),
    show_row_names = TRUE,
    row_names_side = "left",
    row_names_gp = gpar(fontsize = 19), # Compounds on y-axis
    row_title_rot = 0,
    row_title_gp = gpar(fontsize = 0),
    heatmap_legend_param = list(at = seq(0.05, 0, -0.01))
  )

# Colour ramp for the fold-change heatmap body and its legend:
# "#00aaad" at -5, white at 0, "#ad003a" at +5.
col_fun <- colorRamp2(
  breaks = c(-5, 0, 5),
  colors = c("#00aaad", "white", "#ad003a")
)

# Global parameter for annotation 
# ht_opt$COLUMN_ANNO_PADDING <- unit(2.5, "mm")

# Main metabolomics heatmap: `heatmap_val` per compound (rows) and patient
# (columns), drawn with the "Fold Change (log2)" colour scale.
#
# Steps: title-case compound names -> attach class/subclass from
# `heatmap_lookup` and p-values from `heatmap_pvals` -> collapse classes and
# subclasses into the display groups -> widen to one row per patient ->
# transpose -> draw. Row order/split and column order/split come from
# `heatmap_order`, `heatmap_labels` and `heatmap_column_order`, which are
# defined outside this block.
#
# Changes from the previous version:
#   * `id_cols` is passed to pivot_wider() by name; supplying it by position
#     is deprecated in current tidyr.
#   * The second, key-less join against `heatmap_pvals` is gone. `p.adj` and
#     `p` are already attached by the `by = "compound"` join, and the repeat
#     join could only add columns that pivot_wider() discards.
#   * The group_by()/ungroup() pairs around arrange() are gone; arrange()
#     ignores grouping unless `.by_group = TRUE`.
gg_metab_heatmap <-
  heatmap_data %>%
  mutate(
    compound = str_to_title(compound),
    # str_to_title() lower-cases the "Q"; restore the intended spelling
    compound = recode(compound, Preq1 = "PreQ1")
  ) %>%
  left_join(heatmap_lookup) %>%
  left_join(
    heatmap_pvals %>%
      group_by(compound, p.adj, p) %>%
      slice(1) %>%
      select(compound, p.adj, p),
    by = "compound"
  ) %>%
  mutate(
    class = case_when(
      class %in% c("Dicarboxylic Acid") ~ "Fatty Acid",
      class %in% c("Phenolic Aromatic", "Kynurine Pathway") ~ "Add. Cmpds.",
      TRUE ~ class
    ),
    subclass = case_when(
      subclass %in% c(
        "Branched-Chain Fatty Acid",
        "Aminated Fatty Acid",
        "Long-Chain Fatty Acid",
        "Dicarboxylic Acid"
      ) ~ "Other Fatty Acid",
      subclass == "Indole" ~ "Tryptophan Metabs.",
      subclass %in% c("Phenolic Aromatic", "Kynurine Pathway") ~ "Add. Cmpds.",
      TRUE ~ subclass
    )
  ) %>%
  ungroup() %>%
  mutate(
    # Factor levels fix the sort order used by arrange() below
    class = factor(
      class,
      levels = c(
        "Fatty Acid",                   # 1
        "Amino Acid",                   # 2
        "Bile Acid",                    # 3
        "Indole",                       # 4
        "Add. Cmpds.",                  # 5
        "Vitamin"                       # 6
      )
    ),
    subclass = factor(
      subclass,
      levels = c(
        "Short-Chain Fatty Acid",       # 1
        "Other Fatty Acid",             # 2
        "Amino Acid",                   # 3
        "Primary Bile Acid",            # 4
        "Secondary Bile Acid",          # 5
        "Conjugated Bile Acid",         # 6
        "Tryptophan Metabs.",           # 7
        "Vitamin",                      # 8
        "Add. Cmpds."                   # 9
      )
    )
  ) %>%
  mutate(
    diversity_group_abv = as.character(diversity_group_abv),
    # Append " Diversity" to every label that does not contain "Healthy"
    diversity_group_abv = ifelse(
      grepl(pattern = "Healthy", as.character(diversity_group_abv)),
      diversity_group_abv,
      paste(diversity_group_abv, "Diversity")
    ),
    diversity_group_abv = factor(
      diversity_group_abv,
      levels = c(
        "Low Diversity", "Medium Diversity",
        "High Diversity", "Healthy Donor"
      )
    )
  ) %>%
  arrange(class, subclass, p.adj, compound) %>%
  pivot_wider(
    id_cols = c(diversity_group_abv, db, patientID, Shannon),
    names_from = compound,
    values_from = heatmap_val
  ) %>%
  arrange(patientID) %>%
  # NOTE(review): presumably the column pivot_wider() creates for rows whose
  # compound is missing; this select() errors if no such column exists.
  select(-`NA`) %>%
  distinct(patientID, .keep_all = TRUE) %>%
  arrange(db, Shannon) %>%
  select(-Shannon, -db, -diversity_group_abv) %>%
  column_to_rownames("patientID") %>%
  as.matrix() %>%
  t() %>% # compounds become rows, patients become columns
  Heatmap(
    name = "Fold Change (log2)",
    col = col_fun,
    na_col = "grey83",
    rect_gp = gpar(col = "grey40", lwd = 1.5),
    # column_names_gp = grid::gpar(fontsize = 16),
    column_gap = unit(2.5, "mm"),
    column_split = heatmap_labels,
    column_order = heatmap_column_order,
    column_title_gp = gpar(fontsize = 20), # Diversity group labels on top of plot
    column_title_rot = 0,
    cluster_columns = FALSE,
    show_column_names = FALSE,
    show_column_dend = FALSE,
    row_names_gp = gpar(fontsize = 16), # Compounds on y-axis
    row_title_gp = gpar(fontsize = 18), # Compound classes on y-axis
    row_title_side = "right", # Place the compound classes on right side of y-axis
    row_gap = unit(3.5, "mm"),
    row_names_side = "left",
    row_order = heatmap_order$compound,
    row_split = heatmap_order$subclass,
    show_row_names = TRUE,
    row_title_rot = 0,
    cluster_rows = FALSE,
    show_row_dend = FALSE,
    heatmap_height = unit(24, "in"),
    heatmap_width = unit(16.5, "in")
  )

# Concatenate the `pvalue_adj` heatmap and the fold-change heatmap into one
# heatmap list.
gg_metab_heatmap_tot <- pvalue_adj + gg_metab_heatmap
# gg_metab_heatmap_tot


# Write Figure 2. The object is printed explicitly so the figure is rendered
# into the PDF device even when this code is source()d or wrapped in a
# function, where a bare object name is not auto-printed.
pdf(file = "./Results/Figure_2.pdf", height = 24, width = 24, onefile = FALSE)
print(gg_metab_heatmap_tot)
invisible(dev.off())

Supplemental Figure 2: Quant Metabolomics-Healthy Donor vs LT Patients

# Conversions of compounds
# Molar mass (g/mol) for each compound that is converted to mM further down.
# Written as compound = mass pairs so each mass sits next to its compound.
metab_conversions <- local({
  molar_masses <- c(
    "Taurocholic Acid"        = 515.7,
    "Glycocholic Acid"        = 465.6,
    "Cholic Acid"             = 408.6,
    "3-Oxolithocholic Acid"   = 374.6,
    "Alloisolithocholic Acid" = 376.6,
    "Deoxycholic Acid"        = 392.6,
    "Isodeoxycholic Acid"     = 392.6,
    "Lithocholic Acid"        = 376.6,
    "Kynurenic Acid"          = 189.17,
    "Kynurenine"              = 208.21,
    "Anthranilic Acid"        = 137.14,
    "Desaminotyrosine"        = 166.17,
    "Niacin"                  = 123.11,
    "Tyrosine"                = 181.19,
    "Tryptamine"              = 160.22,
    "Tryptophan"              = 204.22,
    "Phenylalanine"           = 165.19,
    "Acetate"                 = 59.04,
    "Butyrate"                = 87.1,
    "Propionate"              = 73.07,
    "Succinate"               = 116.07
  )

  # unname() keeps the column a plain numeric vector and the row names 1..n
  data.frame(
    compound = names(molar_masses),
    molar_mass__gmol = unname(molar_masses)
  )
})

# Express every quantified metabolite in millimolar (mM).
#
# The source unit is assigned per compound: the eight bile acids are tagged
# "ugmL", the four short-chain fatty acids "mM", and everything else "uM".
metab_quant_converted <- metab_quant_anon %>%
  mutate(
    compound = ifelse(compound == "isovaleric-acid", "isovalerate", compound),
    compound = str_to_title(compound),
    units = case_when(
      compound %in% c(
        "Taurocholic Acid",
        "Glycocholic Acid",
        "Cholic Acid",
        "3-Oxolithocholic Acid",
        "Alloisolithocholic Acid",
        "Deoxycholic Acid",
        "Isodeoxycholic Acid",
        "Lithocholic Acid"
      ) ~ "ugmL",
      compound %in% c("Acetate", "Butyrate", "Succinate", "Propionate") ~ "mM",
      TRUE ~ "uM"
    )
  ) %>%
  # right_join: every compound listed in metab_conversions is kept, and
  # measured compounds that are not listed there are dropped
  right_join(metab_conversions, by = "compound") %>%
  mutate(
    mvalue__mM = case_when(
      # ug/mL -> mM:
      #   x 1000 mL per L, / 1e6 ug per g, / molar mass (g per mol),
      #   x 1000 mM per M
      units == "ugmL" ~ mvalue * ((1000 / 1000000) / molar_mass__gmol) * 1000,
      # uM -> mM: 1000 uM per 1 mM
      units == "uM" ~ mvalue / 1000,
      # units are already in mM
      TRUE ~ mvalue
    )
  ) %>%
  select(sampleID, compound, mvalue__mM)
  

# Long-format table behind the quantitative metabolomics box plots: one row
# per sample x compound with its concentration in mM (`mvalue__mM`), cohort
# (`db`) and diversity group (`diversity_group_abv`).
#
# Changes from the previous version:
#   * The ordinal mark in the bile-acid labels had been mangled to "Ëš"
#     (UTF-8 bytes of U+02DA read in a single-byte encoding). It is now
#     written as the escape "\u{02da}" so labels, factor levels and the final
#     filter all carry the intended ring-above character regardless of the
#     file encoding.
#   * `filter(compound != "Kynurenic Acid")` was removed: by that point every
#     label carries a class suffix, so the comparison could never match. The
#     compound is excluded by the final allow-list filter instead, exactly as
#     before.
#   * The "(Indole)" suffix is built with paste() like every other suffix.
#   * Compound names are no longer re-normalised here; `metab_quant_converted`
#     already title-cases them.
#   * The per-compound grouping is released before `compound` is converted to
#     a factor, so the result is an ungrouped tibble.
metab_boxplot <-
  metaphlan_df2 %>%
  left_join(metaphlan_df_sumry) %>%
  drop_na(taxid) %>%
  select(sampleID, diversity_group_abv, db) %>%
  group_by(sampleID) %>%
  dplyr::slice(1) %>% # one row per sample
  left_join(metab_quant_converted) %>%
  ungroup() %>%
  mutate(
    class = case_when(
      compound %in% c("Taurocholic Acid", "Glycocholic Acid") ~ "Conjugated Primary Bile Acid",
      compound %in% c("Cholic Acid") ~ "Primary Bile Acid",
      compound %in% c(
        "3-Oxolithocholic Acid", "Alloisolithocholic Acid", "Deoxycholic Acid",
        "Isodeoxycholic Acid", "Lithocholic Acid"
      ) ~ "Secondary Bile Acid",
      compound %in% c(
        "Threonine", "Glycine", "Tyrosine", "Tyramine", "Serine", "Leucine",
        "Isoleucine", "Valine", "Phenylalanine", "Alanine", "Proline",
        "Aspartate", "Methionine", "Glutamate", "Lysine", "Cysteine",
        "Tryptophan"
      ) ~ "Amino Acid",
      compound %in% c("Acetate", "Butyrate", "Succinate", "Propionate") ~ "Short-Chain Fatty Acid",
      compound %in% c("Kynurenic Acid", "Anthranilic Acid", "Kynurenine", "Tryptamine") ~ "Kynurenine Metabolite",
      compound == "Desaminotyrosine" ~ "Phenolic Aromatic",
      compound == "Niacin" ~ "B-Vitamin",
      TRUE ~ "Indole"
    ),
    # Append a short class tag to each compound name for the panel titles
    compound = case_when(
      class == "Conjugated Primary Bile Acid" ~ paste(compound, "(1\u{02da}Conj. BA)"),
      class == "Primary Bile Acid" ~ paste(compound, "(1\u{02da} BA)"),
      class == "Secondary Bile Acid" ~ paste(compound, "(2\u{02da} BA)"),
      class == "Short-Chain Fatty Acid" ~ paste(compound, "(SCFA)"),
      class == "Amino Acid" ~ paste(compound, "(AA)"),
      class == "Phenolic Aromatic" ~ paste(compound, "(Phen. Arom.)"),
      class == "Indole" ~ paste(compound, "(Indole)"),
      class == "Kynurenine Metabolite" ~ paste(compound, "(Kyn. Metab.)"),
      class == "B-Vitamin" ~ paste(compound, "(B-Vitamin)")
    )
  ) %>%
  drop_na() %>%
  # Keep only compounds observed in all four diversity groups
  group_by(compound) %>%
  mutate(count = length(unique(diversity_group_abv))) %>%
  filter(count == 4) %>%
  ungroup() %>%
  select(-count) %>%
  mutate(
    compound = factor(
      compound,
      levels = c(
        "Acetate (SCFA)",                           # 1
        "Propionate (SCFA)",                        # 2
        "Butyrate (SCFA)",                          # 3
        "Succinate (SCFA)",                         # 4
        "Tyrosine (AA)",                            # 5
        "Tryptophan (AA)",                          # 6
        "Phenylalanine (AA)",                       # 7
        "Cholic Acid (1\u{02da} BA)",               # 8
        "Glycocholic Acid (1\u{02da}Conj. BA)",     # 9
        "Taurocholic Acid (1\u{02da}Conj. BA)",     # 10
        "Deoxycholic Acid (2\u{02da} BA)",          # 11
        "Lithocholic Acid (2\u{02da} BA)",          # 12
        "Isodeoxycholic Acid (2\u{02da} BA)",       # 13
        "3-Oxolithocholic Acid (2\u{02da} BA)",     # 14
        "Alloisolithocholic Acid (2\u{02da} BA)",   # 15
        "Desaminotyrosine (Phen. Arom.)",           # 16
        "Kynurenine (Kyn. Metab.)",                 # 17
        "Anthranilic Acid (Kyn. Metab.)",           # 18
        "Tryptamine (Kyn. Metab.)",                 # 19
        "Niacin (B-Vitamin)"                        # 20
      )
    ),
    class = factor(
      class,
      levels = c(
        "Short-Chain Fatty Acid",          # 1
        "Amino Acid",                      # 2
        "Primary Bile Acid",               # 3
        "Conjugated Primary Bile Acid",    # 4
        "Secondary Bile Acid",             # 5
        "Indole",                          # 6
        "Phenolic Aromatic",               # 7
        "Kynurenine Metabolite",           # 8
        "B-Vitamin"                        # 9
      )
    )
  ) %>%
  mutate(
    db = factor(db, levels = c("Liver Transplant", "Healthy Donor")),
    diversity_group_abv = factor(
      diversity_group_abv,
      levels = c("Low", "Medium", "High", "Healthy Donor")
    )
  ) %>%
  # Allow-list of compounds that are plotted. Labels outside the factor
  # levels above are NA at this point and are dropped here as well.
  filter(compound %in% c(
    "Acetate (SCFA)",
    "Butyrate (SCFA)",
    "Propionate (SCFA)",
    "Glycocholic Acid (1\u{02da}Conj. BA)",
    "Taurocholic Acid (1\u{02da}Conj. BA)",
    "Cholic Acid (1\u{02da} BA)",
    "Deoxycholic Acid (2\u{02da} BA)",
    "Lithocholic Acid (2\u{02da} BA)",
    "Alloisolithocholic Acid (2\u{02da} BA)"
  ))


# Pairwise two-sided Wilcoxon tests of concentration between diversity groups,
# run per class/compound for the comparisons listed in `diversity_comps`
# (defined outside this block). wilcox_test() itself applies no correction;
# adjust_pvalue() then applies Benjamini-Hochberg to the resulting table.
#
# Fix: the text label was built with paste("p.adj = ", ...), whose default
# separator produced a double space ("p.adj =  0.03"); paste0() is used now.
metab_boxplot_stats <-
  metab_boxplot %>%
  group_by(class, compound) %>%
  rstatix::wilcox_test(
    mvalue__mM ~ diversity_group_abv,
    comparisons = diversity_comps,
    p.adjust.method = "none",
    alternative = "two.sided"
  ) %>%
  rstatix::adjust_pvalue(method = "BH") %>%
  rstatix::add_significance("p.adj") %>%
  mutate(
    # From here on `p.adj` is a display string, not a number
    p.adj = ifelse(
      p.adj < 0.001,
      "p.adj < 0.001",
      paste0("p.adj = ", round(p.adj, 2))
    ),
    # Set stats brackets to fixed points since the y-scale will be log10
    # transformed. Comparisons not listed here get NA.
    y.position = case_when(
      group1 == "High" & group2 == "Healthy Donor" ~ 2.30,
      group1 == "Medium" & group2 == "High" ~ 2.7,
      group1 == "Low" & group2 == "High" ~ 3.1,
      group1 == "Low" & group2 == "Medium" ~ 3.5
    )
  )

# Summary statistics
# One column per compound (wide format, sample IDs moved to row names), then
# summarised by diversity group: mean (SD), median (quartiles) and range,
# formatted in scientific notation with three significant digits.
metab_boxplot_summay_stats <-
  metab_boxplot %>%
  ungroup() %>%
  select(sampleID, diversity_group_abv, compound, mvalue__mM) %>%
  pivot_wider(
    id_cols = c(sampleID, diversity_group_abv),
    names_from = "compound",
    values_from = "mvalue__mM"
  ) %>%
  column_to_rownames(var = "sampleID") %>%
  tbl_summary(
    by = diversity_group_abv,
    type = all_continuous() ~ "continuous2",
    statistic = all_continuous() ~ c(
      "{mean} ({sd})",
      "{median} ({p25}, {p75})",
      "{min}-{max}"
    ),
    digits = all_continuous() ~ function(x) format(x, digits = 3, scientific = TRUE)
  ) %>%
  bold_labels() %>%
  italicize_levels()

# Show the table with a caption (the stored object itself is left uncaptioned)
metab_boxplot_summay_stats %>%
  gtsummary::modify_caption("**Diversity Group Summary Statistics**")
Diversity Group Summary Statistics
Characteristic Low, N = 36 Medium, N = 39 High, N = 26 Healthy Donor, N = 21
Acetate (SCFA)
    Mean (SD) 6.54e+00 (9.29e+00) 2.12e+01 (2.46e+01) 3.33e+01 (2.40e+01) 4.82e+01 (1.85e+01)
    Median (IQR) 1.63e+00 (3.38e-01, 1.16e+01) 1.29e+01 (2.02e+00, 4.14e+01) 3.13e+01 (1.51e+01, 4.62e+01) 4.93e+01 (2.95e+01, 6.31e+01)
    Minimum-Maximum 0.0e+00-4.81e+01 0.0e+00-1.14e+02 6.5e-01-8.62e+01 2.0e+01-7.61e+01
Alloisolithocholic Acid (2˚ BA)
    Mean (SD) 5.90e-04 (7.63e-04) 6.70e-04 (1.04e-03) 6.51e-03 (1.08e-02) 3.00e-02 (3.40e-02)
    Median (IQR) 2.79e-04 (0.00e+00, 7.97e-04) 1.59e-04 (0.00e+00, 1.02e-03) 1.18e-03 (0.00e+00, 6.52e-03) 1.23e-02 (7.41e-03, 3.20e-02)
    Minimum-Maximum 0e+00-2.55e-03 0e+00-5.12e-03 0e+00-4.28e-02 0e+00-1.07e-01
Butyrate (SCFA)
    Mean (SD) 4.19e-01 (3.80e-01) 2.36e+00 (3.22e+00) 5.88e+00 (6.08e+00) 1.53e+01 (1.01e+01)
    Median (IQR) 4.60e-01 (6.00e-02, 5.80e-01) 8.30e-01 (3.05e-01, 3.25e+00) 3.89e+00 (1.99e+00, 6.86e+00) 1.24e+01 (8.87e+00, 2.34e+01)
    Minimum-Maximum 0.0e+00-1.28e+00 0.0e+00-1.22e+01 4.0e-02-2.66e+01 1.7e+00-3.85e+01
Cholic Acid (1˚ BA)
    Mean (SD) 4.81e-01 (1.33e+00) 8.09e-01 (1.80e+00) 6.09e-01 (2.50e+00) 2.08e-02 (4.12e-02)
    Median (IQR) 2.60e-02 (2.43e-03, 1.83e-01) 1.23e-01 (2.13e-02, 7.06e-01) 5.09e-03 (1.68e-03, 6.89e-02) 5.41e-03 (2.14e-03, 1.21e-02)
    Minimum-Maximum 0.00e+00-7.49e+00 1.10e-03-9.18e+00 0.00e+00-1.27e+01 4.85e-04-1.67e-01
Deoxycholic Acid (2˚ BA)
    Mean (SD) 6.61e-04 (7.86e-04) 2.61e-02 (1.28e-01) 2.52e-01 (3.31e-01) 1.56e+00 (2.01e+00)
    Median (IQR) 3.53e-04 (3.82e-05, 1.13e-03) 6.62e-04 (5.09e-05, 3.40e-03) 1.01e-01 (4.12e-02, 2.63e-01) 9.39e-01 (4.81e-01, 1.79e+00)
    Minimum-Maximum 0.00e+00-2.75e-03 0.00e+00-8.02e-01 3.33e-03-1.11e+00 1.01e-01-8.87e+00
Glycocholic Acid (1˚Conj. BA)
    Mean (SD) 1.39e-01 (5.16e-01) 1.76e-01 (6.96e-01) 1.42e-03 (3.08e-03) 4.67e-03 (9.85e-03)
    Median (IQR) 4.51e-03 (1.66e-04, 2.43e-02) 2.13e-03 (8.40e-05, 5.12e-02) 2.04e-04 (0.00e+00, 1.50e-03) 5.80e-04 (8.38e-05, 2.90e-03)
    Minimum-Maximum 0e+00-2.93e+00 0e+00-4.31e+00 0e+00-1.45e-02 0e+00-4.17e-02
Lithocholic Acid (2˚ BA)
    Mean (SD) 3.73e-04 (6.01e-04) 1.96e-02 (7.89e-02) 4.21e-01 (5.52e-01) 9.03e-01 (7.03e-01)
    Median (IQR) 0.00e+00 (0.00e+00, 5.58e-04) 6.37e-04 (1.99e-04, 1.94e-03) 1.46e-01 (4.75e-02, 6.92e-01) 6.88e-01 (4.32e-01, 1.14e+00)
    Minimum-Maximum 0.00e+00-1.75e-03 0.00e+00-4.41e-01 2.37e-03-2.24e+00 1.72e-01-2.86e+00
Propionate (SCFA)
    Mean (SD) 6.58e-01 (1.02e+00) 5.13e+00 (6.45e+00) 1.07e+01 (8.24e+00) 1.63e+01 (7.21e+00)
    Median (IQR) 4.35e-01 (1.38e-01, 6.20e-01) 1.87e+00 (5.15e-01, 7.83e+00) 9.50e+00 (3.95e+00, 1.70e+01) 1.76e+01 (1.04e+01, 2.06e+01)
    Minimum-Maximum 0.00e+00-5.81e+00 0.00e+00-2.39e+01 8.00e-02-2.80e+01 2.96e+00-2.89e+01
Taurocholic Acid (1˚Conj. BA)
    Mean (SD) 5.62e-02 (9.73e-02) 1.10e-01 (2.91e-01) 1.05e-03 (2.16e-03) 9.58e-04 (1.60e-03)
    Median (IQR) 9.42e-03 (2.48e-04, 6.03e-02) 1.32e-03 (1.45e-04, 2.19e-02) 2.91e-05 (0.00e+00, 5.43e-04) 2.33e-04 (0.00e+00, 1.24e-03)
    Minimum-Maximum 0e+00-4.01e-01 0e+00-1.60e+00 0e+00-9.71e-03 0e+00-6.73e-03
# Save the summary table as a PNG. `filename` is spelled out in full: the
# previous `file =` only reached it through partial argument matching.
# `vwidth` / `vheight` are passed through unchanged.
gt::gtsave(
  gtsummary::as_gt(metab_boxplot_summay_stats),
  filename = "./Results/Metab_Quant_Summary_Stats.png",
  vwidth = 1500,
  vheight = 1000
)

# Box plots of metabolite concentration (mM, log10 y-axis) per diversity
# group, one panel per compound, with every observation overlaid as a
# jittered point and the pairwise test brackets from `metab_boxplot_stats`.
# `et`, `eb` and `er` are helpers defined outside this block -- presumably
# shorthands for element_text(), element_blank() and element_rect(); confirm.
set.seed(123) # for consistent jittering of points
gg_metab_boxplot <-
  ggboxplot(
    metab_boxplot,
    x = "diversity_group_abv",
    y = "mvalue__mM",
    fill = "db",
    color = "diversity_group_abv",
    alpha = 0.65,
    outlier.shape = NA, # observations are drawn by geom_point() below
    facet.by = c("class", "compound")
  ) +
  theme(
    legend.text = et(size = 12, color = "black"),
    legend.title = et(size = 14, color = "black"),
    axis.title.x = eb(),
    axis.title.y = et(size = 12, color = "black"),
    panel.border = eb(),
    strip.background = er(colour = "white", fill = "white")
  ) +
  geom_hline(yintercept = 0) +
  geom_segment(aes(x = 0.35, y = 0, xend = 0.35, yend = Inf)) +
  # This facet specification replaces the one set through `facet.by`
  facet_wrap(~compound, scales = "fixed") +
  stat_pvalue_manual(
    metab_boxplot_stats,
    tip.length = 0.015
  ) +
  geom_point(
    data = metab_boxplot,
    aes(
      x = diversity_group_abv,
      y = mvalue__mM,
      color = diversity_group_abv
    ),
    position = position_jitter(width = 0.2),
    size = 2,
    alpha = 0.65
  ) +
  scale_fill_manual("Cohort", values = rev(pirate_colors)) +
  scale_color_manual("Diversity Group", values = diversity_group_colors) +
  scale_y_log10(
    limits = c(0.01, 5000),
    labels = c(0.01, 0.1, 1, 10, 100, 1000),
    breaks = c(0.01, 0.1, 1, 10, 100, 1000),
    expand = expansion(mult = c(0.1, 0.2))
  ) +
  ylab("Concentration (mM)\n")

gg_metab_boxplot

# Write Supplemental Figure 2. The plot is printed explicitly so it is
# rendered into the PDF device even when this code is source()d or wrapped in
# a function, where a bare object name is not auto-printed.
cairo_pdf(filename = "./Results/Supplemental_Figure_2.pdf", width = 14, height = 10, onefile = FALSE)

print(gg_metab_boxplot)

invisible(dev.off())

Figure 5A: Enterococcus Expansion and Infection

# Stacked relative-abundance bars for samples from patients with a bacterial
# infection, ordered on the x-axis by decreasing Enterococcus abundance.
# `et` / `eb` are helpers defined outside this block -- presumably shorthands
# for element_text() / element_blank(); confirm.
gg_ecoc_all_patients <- peri_matrix_all %>%
  distinct(sampleID, .keep_all = TRUE) %>%
  select(sampleID, patientID, bact_infection_present) %>%
  mutate(
    bact_infection_present = ifelse(
      bact_infection_present == "No",
      "No Infection",
      "Infection"
    ),
    bact_infection_present = factor(
      bact_infection_present,
      levels = c("Infection", "No Infection")
    )
  ) %>%
  # Summed `pctseqs` of all species matching "Enterococcus", per sample
  left_join(
    metaphlan_peri_anon %>%
      select(-bact_infection_present) %>%
      group_by(patientID, sampleID) %>%
      filter(grepl(x = Species, pattern = "Enterococcus", ignore.case = TRUE)) %>%
      count(sampleID, wt = pctseqs, name = "enterococcus_rel_abundance")
  ) %>%
  # Full taxonomic profile of each sample
  left_join(
    metaphlan_peri_anon %>%
      select(patientID, sampleID, Kingdom:pctseqs)
  ) %>%
  ungroup() %>%
  arrange(Kingdom, Phylum, Class, Order, Family, Genus) %>%
  mutate(Genus = paste0(Phylum, "-", Order, "-", Family, "-", Genus)) %>%
  group_by(sampleID) %>%
  arrange(Genus) %>%
  mutate(
    cum.pct = cumsum(pctseqs),
    y.text = (cum.pct + c(0, cum.pct[-length(cum.pct)])) / 2
  ) %>%
  ungroup() %>%
  dplyr::select(-cum.pct) %>%
  mutate(
    sampleID = ifelse(is.na(sampleID), patientID, sampleID),
    Genus = factor(Genus, levels = unique(Genus)),
    # Move Enterococcus to the end of the fill levels
    Genus = fct_relevel(
      Genus,
      "Firmicutes-Lactobacillales-Enterococcaceae-Enterococcus",
      after = Inf
    )
  ) %>%
  arrange(-enterococcus_rel_abundance) %>%
  filter(bact_infection_present != "No Infection") %>%
  ggplot(aes(
    x = reorder(sampleID, -enterococcus_rel_abundance),
    y = pctseqs
  )) +
  geom_bar(aes(fill = Genus), stat = "identity") +
  theme_bw() +
  theme(
    legend.position = "none",
    axis.text.x = et(angle = 90, hjust = 0.5),
    strip.text.x = eb(),
    strip.background = eb(),
    axis.title.y = et(color = "black", size = 12),
    axis.text.y = et(color = "black", size = 10)
  ) +
  scale_fill_manual(values = metaphlan_pal2) +
  scale_y_continuous(expand = expansion(mult = c(0.005, 0.005))) +
  ylab("Shotgun Relative Abundance (%)\n") +
  xlab("") +
  facet_grid(
    ~bact_infection_present,
    space = "free_x",
    scales = "free_x"
  )
# gg_ecoc_all_patients


# Discrete heatmap of infections
#
# Builds the sample x infecting-organism table for the tile plot under the
# Enterococcus abundance bars. Several steps overwrite `organisms` and
# `org_presence` in sequence, so the statement order is kept exactly as is.
#
# NOTE(review): the patterns "enterobactercloaceae" and
# "stenotrophmonasmaltophilia" look misspelled (cloacae / Stenotrophomonas),
# and "enterobacter" is not part of the allow-list regex below -- confirm
# against the spellings actually present in `organism1`.
ecoc_infx_orgs <- peri_criteria_all %>%
  filter(sampleID %in% peri_matrix_all$sampleID) %>%
  # One record per patient and `eday`: the one with the largest `infx_stool`
  group_by(patientID, eday) %>%
  arrange(-infx_stool) %>%
  dplyr::slice(1) %>%
  ungroup() %>%
  select(
    patientID, sampleID, bact_infection_present, infx_stool,
    organism1, micro1.factor
  ) %>%
  distinct() %>%
  mutate(
    # Normalise organism names: strip all whitespace, lower-case
    organism1 = gsub(x = organism1, pattern = "\\s+", replacement = ""),
    organism1 = str_to_lower(string = organism1),
    # Anything not on the allow-list is relabelled "Culture Negative"
    organism1 = ifelse(
      grepl(
        x = organism1,
        pattern = "enterococcus|enterobacterales|klebsiella|escherichia|citrobacter|proteus|staphyl|clostrid|pseudo|steno|bacteroides|helico"
      ),
      organism1,
      "Culture Negative"
    )
  ) %>%
  group_by(patientID, sampleID, infx_stool) %>%
  # One logical indicator column per organism of interest
  mutate(
    `Enterococcus faecium` = grepl(
      x = organism1, pattern = "enterococcusfaecium", ignore.case = TRUE
    ),
    `Enterococcus faecalis` = grepl(
      x = organism1, pattern = "enterococcusfaecalis", ignore.case = TRUE
    ),
    `Enterococcus avium` = grepl(
      x = organism1, pattern = "enterococcusavium", ignore.case = TRUE
    ),
    `Enterococcus gallinarum` = grepl(
      x = organism1, pattern = "enterococcusgallinarum", ignore.case = TRUE
    ),
    `Klebsiella pneumoniae` = grepl(
      x = organism1, pattern = "klebsiellapneumoniae", ignore.case = TRUE
    ),
    `Enterobacter cloaceae` = grepl(
      x = organism1, pattern = "enterobactercloaceae", ignore.case = TRUE
    ),
    `Escherichia coli` = grepl(
      x = organism1, pattern = "escherichiacoli", ignore.case = TRUE
    ),
    `Citrobacter freundii` = grepl(
      x = organism1, pattern = "citrobacterfreundii", ignore.case = TRUE
    ),
    `Proteus mirabilis` = grepl(
      x = organism1, pattern = "proteusmirabilis", ignore.case = TRUE
    ),
    `Staphylococcus aureus` = grepl(
      x = organism1, pattern = "staphylococcusaureus", ignore.case = TRUE
    ),
    `Staphylococcus epidermis` = grepl(
      x = organism1, pattern = "staphylococcusepidermidis", ignore.case = TRUE
    ),
    `Pseudomonas aeruginosa` = grepl(
      x = organism1, pattern = "pseudomonasaeruginosa", ignore.case = TRUE
    ),
    `Stenotrophmonas maltophilia` = grepl(
      x = organism1, pattern = "stenotrophmonasmaltophilia", ignore.case = TRUE
    ),
    `Helicobacter pylori` = grepl(
      x = organism1, pattern = "helicobacterpylori", ignore.case = TRUE
    ),
    `Clostridium difficile` = grepl(
      x = organism1,
      pattern = "clostridiumdifficile|clostridioidesdifficile",
      ignore.case = TRUE
    ),
    `Bacteroides sp.` = grepl(
      x = organism1, pattern = "bacteroides", ignore.case = TRUE
    ),
    `Culture Negative` = grepl(
      x = organism1, pattern = "Culture Negative", ignore.case = TRUE
    )
  ) %>%
  # Long format: one row per sample x organism indicator
  pivot_longer(
    -c(patientID:micro1.factor),
    names_to = "organisms",
    values_to = "org_presence"
  ) %>%
  mutate(
    organisms = ifelse(
      bact_infection_present == "No",
      "No Bacterial Infection",
      organisms
    ),
    org_presence = ifelse(org_presence == TRUE, 1, 0)
  ) %>%
  group_by(sampleID, infx_stool, bact_infection_present, organisms) %>%
  dplyr::slice_max(org_presence) %>%
  ungroup() %>%
  filter(org_presence == 1) %>%
  # Summed `pctseqs` of all species matching "Enterococcus", per sample
  left_join(
    metaphlan_peri_anon %>%
      select(-bact_infection_present) %>%
      group_by(patientID, sampleID) %>%
      filter(grepl(x = Species, pattern = "Enterococcus", ignore.case = TRUE)) %>%
      count(sampleID, wt = pctseqs, name = "enterococcus_rel_abundance")
  ) %>%
  left_join(
    metaphlan_peri_anon %>%
      select(patientID, sampleID, Kingdom:pctseqs)
  ) %>%
  # Collapse back to one row per sample x organism
  group_by(patientID, sampleID, organisms, org_presence) %>%
  dplyr::slice(1) %>%
  mutate(sampleID = ifelse(is.na(sampleID), patientID, sampleID)) %>%
  mutate(
    org_presence = ifelse(
      grepl(pattern = "No", x = bact_infection_present, ignore.case = TRUE),
      0,
      org_presence
    )
  ) %>%
  ungroup() %>%
  mutate(
    organisms = ifelse(
      bact_infection_present == "Yes" & org_presence == 0,
      "Other",
      organisms
    )
  ) %>%
  arrange(-enterococcus_rel_abundance) %>%
  mutate(
    # Organisms outside the highlighted set are pooled
    organisms = ifelse(
      grepl(
        x = organisms,
        pattern = "enterococcus|klebsiella|escherichia|proteus|citrobacter|culture",
        ignore.case = TRUE
      ),
      organisms,
      "Other Bacterial Infection"
    ),
    organisms = ifelse(
      bact_infection_present == "No",
      "No Bacterial Infection",
      organisms
    )
  )

# Plot-ready version of `ecoc_infx_orgs`: de-duplicated per sample/organism,
# with a numeric colour code (`org_colors`) and a fixed organism ordering,
# restricted to samples from patients with an infection.
ecoc_infx_orgs_order <- ecoc_infx_orgs %>%
  group_by(sampleID, patientID, organisms) %>%
  distinct(sampleID, patientID, .keep_all = TRUE) %>%
  ungroup() %>%
  mutate(
    # E. gallinarum is pooled into the "Other" category
    organisms = ifelse(
      grepl(organisms, pattern = "gallinarum"),
      "Other Bacterial Infection",
      organisms
    ),
    # Colour code per organism; 0 is the fallback for everything else
    org_colors = case_when(
      grepl(x = organisms, pattern = "enterococcus faecium", ignore.case = TRUE) ~ 1,
      grepl(x = organisms, pattern = "enterococcus faecalis", ignore.case = TRUE) ~ 2,
      grepl(x = organisms, pattern = "enterococcus avium", ignore.case = TRUE) ~ 3,
      grepl(x = organisms, pattern = "klebsiella pneumoniae", ignore.case = TRUE) ~ 4,
      grepl(x = organisms, pattern = "escherichia coli", ignore.case = TRUE) ~ 5,
      grepl(x = organisms, pattern = "proteus mirabilis", ignore.case = TRUE) ~ 6,
      grepl(x = organisms, pattern = "citrobacter freundii", ignore.case = TRUE) ~ 7,
      grepl(x = organisms, pattern = "other bacterial infection|gallinarum", ignore.case = TRUE) ~ 8,
      grepl(x = organisms, pattern = "culture negative", ignore.case = TRUE) ~ 9,
      TRUE ~ 0
    ),
    organisms = as.factor(organisms),
    organisms = factor(
      organisms,
      levels = c(
        "Enterococcus faecium",
        "Enterococcus faecalis",
        "Enterococcus avium",
        "Klebsiella pneumoniae",
        "Escherichia coli",
        "Proteus mirabilis",
        "Citrobacter freundii",
        "Other Bacterial Infection",
        "Culture Negative",
        "No Bacterial Infection"
      )
    ),
    org_colors = factor(
      org_colors,
      levels = c("1", "2", "3", "4", "5", "6", "7", "8", "9", "0")
    )
  ) %>%
  # Joined on all shared columns. The factors are rebuilt right after --
  # presumably because the join can hand `organisms` back as character;
  # confirm before simplifying.
  left_join(ecoc_infx_orgs) %>%
  mutate(
    organisms = factor(
      organisms,
      levels = c(
        "Enterococcus faecium",
        "Enterococcus faecalis",
        "Enterococcus avium",
        "Klebsiella pneumoniae",
        "Escherichia coli",
        "Proteus mirabilis",
        "Citrobacter freundii",
        "Other Bacterial Infection",
        "Culture Negative",
        "No Bacterial Infection"
      )
    ),
    org_colors = factor(
      org_colors,
      levels = c("1", "2", "3", "4", "5", "6", "7", "8", "9", "0")
    )
  ) %>%
  drop_na(organisms) %>%
  mutate(
    bact_infection_present = ifelse(
      bact_infection_present == "No",
      "No Infection",
      "Infection"
    ),
    bact_infection_present = factor(
      bact_infection_present,
      levels = c("Infection", "No Infection")
    )
  ) %>%
  ungroup() %>%
  filter(bact_infection_present == "Infection") %>%
  droplevels()

# Tile plot of the infecting organism per sample, using the same x-axis
# ordering (decreasing Enterococcus abundance) as the abundance bar plot.
# `et` / `eb` are helpers defined outside this block -- presumably shorthands
# for element_text() / element_blank(); confirm.
#
# Fix: fill colours and legend labels are now keyed by the `org_colors` code
# instead of by position. The positional label vector listed
# "Citrobacter freundii" before "Proteus mirabilis", while `org_colors` codes
# Proteus as 6 and Citrobacter as 7, so the legend named those two colours
# the wrong way round. Positional matching also shifted every later colour
# and label whenever an organism was absent from the data.
gg_ecoc_infx_orgs <- ecoc_infx_orgs_order %>%
  ggplot(aes(
    x = reorder(sampleID, -enterococcus_rel_abundance),
    y = organisms,
    fill = as.factor(org_colors)
  )) +
  geom_tile(color = "black") +
  theme_bw() +
  theme(
    axis.title.y = et(color = "black", size = 12),
    axis.text.y = et(color = "black", size = 10),
    axis.ticks.x = eb(),
    axis.text.x = eb(),
    axis.title.x = eb(),
    panel.grid = eb(),
    strip.text = eb()
  ) +
  scale_fill_manual(
    values = c(
      "1" = "#129246",
      "2" = "#0C7A3A",
      "3" = "#08592B",
      "4" = "#FF0000",
      "5" = "#CC0404",
      "6" = "#8A0202",
      "7" = "#5C0202",
      "8" = "#E6C66E",
      "9" = "#BD992D",
      "0" = "#00000000"
    ),
    labels = c(
      "1" = "Enterococcus faecium",
      "2" = "Enterococcus faecalis",
      "3" = "Enterococcus avium",
      "4" = "Klebsiella pneumoniae",
      "5" = "Escherichia coli",
      "6" = "Proteus mirabilis",
      "7" = "Citrobacter freundii",
      "8" = "Other Bacterial Infection",
      "9" = "Culture Negative",
      "0" = "No Bacterial Infection"
    )
  ) +
  labs(fill = "Infecting Organism", y = "Infecting Organism\n") +
  scale_y_discrete(
    expand = expansion(mult = c(0.005, 0.005)),
    # reversed so the first organism level is drawn at the top
    limits = rev(levels(ecoc_infx_orgs_order$organisms))
  ) +
  facet_grid(
    . ~ bact_infection_present,
    space = "free_x",
    scales = "free_x"
  )

# gg_ecoc_infx_orgs

# Show the abundance bars stacked above the infection tile plot
yingtools2::gg.stack(
  gg_ecoc_all_patients,
  gg_ecoc_infx_orgs,
  heights = c(1, 0.5),
  adjust.themes = TRUE
)

# Same stacked figure, written to PDF
pdf(
  file = "./Results/Figure_5A.pdf",
  height = 8,
  width = 10,
  onefile = FALSE
)

yingtools2::gg.stack(
  gg_ecoc_all_patients,
  gg_ecoc_infx_orgs,
  heights = c(1, 0.5),
  adjust.themes = TRUE
)

invisible(dev.off())

# Save object for combining later
# (uniform 5-unit margins, tile-plot legend suppressed, `as.gtable = TRUE`)
fig_5a <- yingtools2::gg.stack(
  gg_ecoc_all_patients +
    theme(plot.margin = margin(t = 5, r = 5, b = 5, l = 5)),
  gg_ecoc_infx_orgs +
    theme(
      legend.position = "none",
      plot.margin = margin(t = 5, r = 5, b = 5, l = 5)
    ),
  heights = c(1, 0.5),
  adjust.themes = TRUE,
  as.gtable = TRUE
)

Figure 5C: Enterobacterales Expansion and Infection

# Stacked relative-abundance bars for samples from patients with a bacterial
# infection, ordered on the x-axis by decreasing summed abundance of the four
# listed Enterobacterales species.
# `et` / `eb` are helpers defined outside this block -- presumably shorthands
# for element_text() / element_blank(); confirm.
gg_ebac_all_patients <- peri_matrix_all %>%
  distinct(sampleID, .keep_all = TRUE) %>%
  select(sampleID, patientID, bact_infection_present) %>%
  mutate(
    bact_infection_present = ifelse(
      bact_infection_present == "No",
      "No Infection",
      "Infection"
    ),
    bact_infection_present = factor(
      bact_infection_present,
      levels = c("Infection", "No Infection")
    )
  ) %>%
  # Summed `pctseqs` of the four species named in the pattern, per sample
  left_join(
    metaphlan_peri_anon %>%
      select(-bact_infection_present) %>%
      group_by(patientID, sampleID) %>%
      filter(grepl(
        x = Species,
        pattern = "Klebsiella pneumoniae|Escherichia coli|Citrobacter freundii|Proteus mirabilis"
      )) %>%
      count(sampleID, wt = pctseqs, name = "enterobacterales_rel_abundance")
  ) %>%
  # Full taxonomic profile of each sample
  left_join(
    metaphlan_peri_anon %>%
      select(patientID, sampleID, Kingdom:pctseqs)
  ) %>%
  ungroup() %>%
  arrange(Kingdom, Phylum, Class, Order, Family, Genus) %>%
  mutate(Genus = paste0(Phylum, "-", Order, "-", Family, "-", Genus)) %>%
  group_by(sampleID) %>%
  arrange(Genus) %>%
  mutate(
    cum.pct = cumsum(pctseqs),
    y.text = (cum.pct + c(0, cum.pct[-length(cum.pct)])) / 2
  ) %>%
  ungroup() %>%
  dplyr::select(-cum.pct) %>%
  mutate(
    sampleID = ifelse(is.na(sampleID), patientID, sampleID),
    Genus = factor(Genus, levels = unique(Genus)),
    # Move the Enterobacterales genera to the end of the fill levels
    Genus = fct_relevel(
      Genus,
      c(
        "Proteobacteria-Enterobacterales-Enterobacteriaceae-Citrobacter",
        "Proteobacteria-Enterobacterales-Enterobacteriaceae-Enterobacter",
        "Proteobacteria-Enterobacterales-Enterobacteriaceae-Escherichia",
        "Proteobacteria-Enterobacterales-Enterobacteriaceae-Klebsiella",
        "Proteobacteria-Enterobacterales-Morganellaceae-Proteus"
      ),
      after = Inf
    )
  ) %>%
  arrange(-enterobacterales_rel_abundance) %>%
  filter(bact_infection_present == "Infection") %>%
  ggplot(aes(
    x = reorder(sampleID, -enterobacterales_rel_abundance),
    y = pctseqs
  )) +
  geom_bar(aes(fill = Genus), stat = "identity") +
  theme_bw() +
  theme(
    legend.position = "none",
    axis.text.x = et(angle = 90, hjust = 0.5),
    strip.text.x = eb(),
    strip.background = eb(),
    axis.title.y = et(color = "black", size = 12),
    axis.text.y = et(color = "black", size = 10)
  ) +
  scale_fill_manual(values = metaphlan_pal2) +
  scale_y_continuous(expand = expansion(mult = c(0.005, 0.005))) +
  ylab("Shotgun Relative Abundance (%)\n") +
  xlab("") +
  facet_grid(
    ~bact_infection_present,
    space = "free_x",
    scales = "free_x"
  )
# gg_ebac_all_patients

# Discrete heatmap of infections
#
# Builds one row per sample and infecting organism:
#   1. keep the first row per patient/eday after sorting by descending
#      infx_stool;
#   2. normalise organism1 (whitespace removed, lower-cased) and replace
#      anything outside the whitelist below with "Culture Negative";
#   3. flag each organism of interest in its own logical column, pivot to long
#      format and keep only the organisms that are present;
#   4. attach the summed relative abundance of the four listed Enterobacterales
#      species plus the taxonomy columns from metaphlan_peri_anon;
#   5. collapse organisms outside the plotted set into
#      "Other Bacterial Infection" / "No Bacterial Infection".
#
# NOTE(review): the whitelist contains "enterobacterales" but not
# "enterobacter", so an organism1 such as "enterobactercloacae" is turned into
# "Culture Negative" before the `Enterobacter cloaceae` flag is evaluated.
# Confirm whether that is intended.
ebac_infx_orgs <- peri_criteria_all %>%
  filter(sampleID %in% peri_matrix_all$sampleID) %>%
  group_by(patientID, eday) %>%
  arrange(-infx_stool) %>%
  dplyr::slice(1) %>%
  ungroup() %>%
  select(
    patientID, sampleID, bact_infection_present, infx_stool,
    organism1, micro1.factor
  ) %>%
  distinct() %>%
  mutate(
    organism1 = gsub(x = organism1, pattern = "\\s+", replacement = ""),
    organism1 = str_to_lower(string = organism1),
    organism1 = ifelse(
      grepl(
        x = organism1,
        pattern = "enterococcus|enterobacterales|klebsiella|escherichia|citrobacter|proteus|staphyl|clostrid|pseudo|steno|bacteroides|helico"
      ),
      organism1,
      "Culture Negative"
    )
  ) %>%
  group_by(patientID, sampleID, infx_stool) %>%
  # One logical flag per organism. Column names are kept exactly as before
  # (including their historical spellings) because they become the values of
  # `organisms` after pivoting.
  mutate(
    `Enterococcus faecium` = grepl("enterococcusfaecium", organism1, ignore.case = TRUE),
    `Enterococcus faecalis` = grepl("enterococcusfaecalis", organism1, ignore.case = TRUE),
    `Enterococcus avium` = grepl("enterococcusavium", organism1, ignore.case = TRUE),
    `Enterococcus gallinarum` = grepl("enterococcusgallinarum", organism1, ignore.case = TRUE),
    `Klebsiella pneumoniae` = grepl("klebsiellapneumoniae", organism1, ignore.case = TRUE),
    # Accept both the correct spelling ("cloacae") and the historical one
    # ("cloaceae").
    `Enterobacter cloaceae` = grepl("enterobactercloace?ae", organism1, ignore.case = TRUE),
    `Escherichia coli` = grepl("escherichiacoli", organism1, ignore.case = TRUE),
    `Citrobacter freundii` = grepl("citrobacterfreundii", organism1, ignore.case = TRUE),
    `Proteus mirabilis` = grepl("proteusmirabilis", organism1, ignore.case = TRUE),
    `Staphylococcus aureus` = grepl("staphylococcusaureus", organism1, ignore.case = TRUE),
    `Staphylococcus epidermis` = grepl("staphylococcusepidermidis", organism1, ignore.case = TRUE),
    `Pseudomonas aeruginosa` = grepl("pseudomonasaeruginosa", organism1, ignore.case = TRUE),
    # Accept both "stenotrophomonas" (correct) and "stenotrophmonas"; without
    # this a correctly spelled entry matches no flag and the sample is removed
    # by the org_presence filter below.
    `Stenotrophmonas maltophilia` = grepl("stenotropho?monasmaltophilia", organism1, ignore.case = TRUE),
    `Helicobacter pylori` = grepl("helicobacterpylori", organism1, ignore.case = TRUE),
    `Clostridium difficile` = grepl(
      "clostridiumdifficile|clostridioidesdifficile", organism1,
      ignore.case = TRUE
    ),
    `Bacteroides sp.` = grepl("bacteroides", organism1, ignore.case = TRUE),
    `Culture Negative` = grepl("Culture Negative", organism1, ignore.case = TRUE)
  ) %>%
  pivot_longer(
    -c(patientID:micro1.factor),
    names_to = "organisms",
    values_to = "org_presence"
  ) %>%
  mutate(
    organisms = ifelse(
      bact_infection_present == "No", "No Bacterial Infection", organisms
    ),
    org_presence = as.numeric(org_presence)
  ) %>%
  group_by(sampleID, infx_stool, bact_infection_present, organisms) %>%
  dplyr::slice_max(org_presence) %>%
  ungroup() %>%
  filter(org_presence == 1) %>%
  # Per-sample summed abundance of the four listed species (x-axis ordering)
  left_join(
    metaphlan_peri_anon %>%
      select(-bact_infection_present) %>%
      group_by(patientID, sampleID) %>%
      filter(grepl(x = Species, pattern = "Klebsiella pneumoniae|Escherichia coli|Citrobacter freundii|Proteus mirabilis")) %>%
      count(sampleID, wt = pctseqs, name = "enterobacterales_rel_abundance")
  ) %>%
  left_join(
    metaphlan_peri_anon %>%
      select(patientID, sampleID, Kingdom:pctseqs)
  ) %>%
  # The taxonomy join fans out to one row per taxon; keep a single row per
  # sample/organism.
  group_by(patientID, sampleID, organisms, org_presence) %>%
  dplyr::slice(1) %>%
  mutate(sampleID = ifelse(is.na(sampleID), patientID, sampleID)) %>%
  mutate(
    org_presence = ifelse(
      grepl(pattern = "No", x = bact_infection_present, ignore.case = TRUE),
      0,
      org_presence
    )
  ) %>%
  ungroup() %>%
  mutate(
    organisms = ifelse(
      bact_infection_present == "Yes" & org_presence == 0, "Other", organisms
    )
  ) %>%
  arrange(-enterobacterales_rel_abundance) %>%
  mutate(
    organisms = ifelse(
      grepl(
        x = organisms,
        pattern = "enterococcus|klebsiella|escherichia|proteus|citrobacter|culture",
        ignore.case = TRUE
      ),
      organisms,
      "Other Bacterial Infection"
    ),
    organisms = ifelse(
      bact_infection_present == "No", "No Bacterial Infection", organisms
    )
  )

# Plot-ready version of ebac_infx_orgs: one row per sample/organism with a
# numeric colour code (org_colors) and ordered factors, restricted to samples
# with an infection.

# Display order of organisms (y axis) and the matching order of colour codes.
ebac_organism_levels <- c(
  "Klebsiella pneumoniae", "Escherichia coli", "Proteus mirabilis",
  "Citrobacter freundii", "Enterococcus faecium", "Enterococcus faecalis",
  "Enterococcus avium", "Other Bacterial Infection", "Culture Negative",
  "No Bacterial Infection"
)
ebac_color_levels <- c("4", "5", "6", "7", "1", "2", "3", "8", "9", "0")

ebac_infx_orgs_order <- ebac_infx_orgs %>%
  group_by(sampleID, patientID, organisms) %>%
  distinct(sampleID, patientID, .keep_all = TRUE) %>%
  ungroup() %>%
  mutate(
    # E. gallinarum is not plotted separately; fold it into "Other".
    organisms = ifelse(
      grepl(organisms, pattern = "gallinarum"),
      "Other Bacterial Infection",
      organisms
    ),
    org_colors = case_when(
      grepl("enterococcus faecium", organisms, ignore.case = TRUE) ~ 1,
      grepl("enterococcus faecalis", organisms, ignore.case = TRUE) ~ 2,
      grepl("enterococcus avium", organisms, ignore.case = TRUE) ~ 3,
      grepl("klebsiella pneumoniae", organisms, ignore.case = TRUE) ~ 4,
      grepl("escherichia coli", organisms, ignore.case = TRUE) ~ 5,
      grepl("proteus mirabilis", organisms, ignore.case = TRUE) ~ 6,
      grepl("citrobacter freundii", organisms, ignore.case = TRUE) ~ 7,
      grepl("other bacterial infection|gallinarum", organisms, ignore.case = TRUE) ~ 8,
      grepl("culture negative", organisms, ignore.case = TRUE) ~ 9,
      TRUE ~ 0
    ),
    organisms = factor(organisms, levels = ebac_organism_levels),
    org_colors = factor(org_colors, levels = ebac_color_levels)
  ) %>%
  left_join(ebac_infx_orgs) %>%
  # Re-apply the factor levels after the join, as the original pipeline did.
  mutate(
    organisms = factor(organisms, levels = ebac_organism_levels),
    org_colors = factor(org_colors, levels = ebac_color_levels)
  ) %>%
  drop_na(organisms) %>%
  mutate(
    bact_infection_present = ifelse(
      bact_infection_present == "No", "No Infection", "Infection"
    ),
    bact_infection_present = factor(
      bact_infection_present,
      levels = c("Infection", "No Infection")
    )
  ) %>%
  ungroup() %>%
  filter(bact_infection_present == "Infection") %>%
  droplevels()

# Tile plot of the infecting organism(s) per sample, ordered on the x axis by
# decreasing Enterobacterales relative abundance.
#
# Colours and legend labels are keyed by the org_colors code rather than by
# position. The positional version labelled code 6 (Proteus mirabilis) as
# "Citrobacter freundii" and code 7 as "Proteus mirabilis", and both vectors
# shifted whenever droplevels() removed an unused code.
ebac_fill_values <- c(
  "4" = "#FF0000", "5" = "#CC0404", "6" = "#8A0202", "7" = "#5C0202",
  "1" = "#129246", "2" = "#0C7A3A", "3" = "#08592B",
  "8" = "#E6C66E", "9" = "#BD992D", "0" = "#00000000"
)
ebac_fill_labels <- c(
  "4" = "Klebsiella pneumoniae",
  "5" = "Escherichia coli",
  "6" = "Proteus mirabilis",
  "7" = "Citrobacter freundii",
  "1" = "Enterococcus faecium",
  "2" = "Enterococcus faecalis",
  "3" = "Enterococcus avium",
  "8" = "Other Bacterial Infection",
  "9" = "Culture Negative",
  "0" = "No Bacterial Infection"
)

gg_ebac_infx_orgs <- ebac_infx_orgs_order %>%
  ggplot(
    aes(
      x = reorder(sampleID, -enterobacterales_rel_abundance),
      y = organisms,
      fill = as.factor(org_colors)
    )
  ) +
  geom_tile(color = "black") +
  theme_bw() +
  theme(
    axis.title.y = et(color = "black", size = 12),
    axis.text.y = et(color = "black", size = 10),
    axis.ticks.x = eb(),
    axis.text.x = eb(),
    axis.title.x = eb(),
    panel.grid = eb(),
    strip.text = eb()
  ) +
  scale_fill_manual(values = ebac_fill_values, labels = ebac_fill_labels) +
  labs(fill = "Infecting Organism", y = "Infecting Organism\n") +
  scale_y_discrete(
    expand = expansion(mult = c(0.005, 0.005)),
    limits = rev(levels(ebac_infx_orgs_order$organisms))
  ) +
  facet_grid(. ~ bact_infection_present, space = "free_x", scales = "free_x")

# gg_ebac_infx_orgs

# Stack the abundance bar plot on top of the infecting-organism tiles
# (relative heights 1 : 0.5).
yingtools2::gg.stack(
  gg_ebac_all_patients,
  gg_ebac_infx_orgs,
  heights = c(1, 0.5),
  adjust.themes = TRUE
)

# Same stacked figure, written to Figure_5C.pdf
pdf(
  file = "./Results/Figure_5C.pdf",
  height = 8,
  width = 10,
  onefile = FALSE
)

yingtools2::gg.stack(
  gg_ebac_all_patients,
  gg_ebac_infx_orgs,
  heights = c(1, 0.5),
  adjust.themes = TRUE
)

invisible(dev.off())

# Save object for combining later (as a gtable, with uniform 5-unit margins and
# the tile legend removed)
fig_5c_margin <- margin(t = 5, r = 5, b = 5, l = 5)
fig_5c <- yingtools2::gg.stack(
  gg_ebac_all_patients + theme(plot.margin = fig_5c_margin),
  gg_ebac_infx_orgs +
    theme(legend.position = "none", plot.margin = fig_5c_margin),
  heights = c(1, 0.5),
  adjust.themes = TRUE,
  as.gtable = TRUE
)

Combine Figure 5A+B+C+D

# Write panels A-D as a 2-column grid to Figure_5ABCD.pdf. Panels B and D get a
# larger bottom margin (25) than the default 5.
pdf(
  file = "./Results/Figure_5ABCD.pdf",
  height = 16,
  width = 16,
  onefile = FALSE
)
# print() explicitly: top-level auto-printing does not happen when the script
# is run through source(), which would leave the PDF empty.
print(
  cowplot::plot_grid(
    fig_5a,
    fig_5b + theme(plot.margin = margin(t = 5, r = 5, b = 25, l = 5)),
    fig_5c,
    fig_5d + theme(plot.margin = margin(t = 5, r = 5, b = 25, l = 5)),
    align = "h",
    axis = "bl",
    ncol = 2,
    rel_widths = c(1, 0.9)
  )
)
invisible(dev.off())

Quant Metabolomics-Enterococcus Expansion

# Compounds (row names of qual_tot_ecoc_expan, the volcano-analysis results per
# the original note) that pass both cut-offs: adjusted p-value <= 0.05 and
# |log2 fold change| > 0.75. Used below to pick the quantified compounds to
# plot.
signif_compounds <- distinct(
  filter(
    rownames_to_column(qual_tot_ecoc_expan, var = "compound"),
    p.adj <= 0.05,
    abs(log2fc_val) > 0.75
  ),
  compound
)

# Long table of quantified metabolite concentrations (mvalue__mM) per sample
# for the significant compounds, annotated with compound class and
# Enterococcus expansion status.
#
# The ring character in the bile-acid labels is written as the escape \u02da so
# it cannot be corrupted by a file-encoding round trip (the labels previously
# contained the mis-decoded bytes of that character).

# Order in which compound classes are listed
ecoc_expan_class_levels <- c(
  "Conjugated Primary Bile Acid", "Primary Bile Acid", "Secondary Bile Acid",
  "Short-Chain Fatty Acid", "Amino Acid", "Phenolic Aromatic", "Indole",
  "Kynurenine Metabolite", "B-Vitamin"
)

# Suffix appended to each compound name, by class. "Indole" has no leading
# space, exactly as in the original paste0() call.
ecoc_expan_class_suffix <- c(
  "Conjugated Primary Bile Acid" = " (1\u02daConj. BA)",
  "Primary Bile Acid" = " (1\u02da BA)",
  "Secondary Bile Acid" = " (2\u02da BA)",
  "Short-Chain Fatty Acid" = " (SCFA)",
  "Amino Acid" = " (AA)",
  "Phenolic Aromatic" = " (Phen. Arom.)",
  "Indole" = "(Indole)",
  "Kynurenine Metabolite" = " (Kyn. Metab.)",
  "B-Vitamin" = " (B-Vitamin)"
)

# NOTE(review): the last six levels carry no class suffix, so they can never
# match a suffixed compound name; those compounds (and every compound not
# listed here) become NA and are removed by the drop_na() that follows.
# Confirm whether that filtering is intended.
ecoc_expan_compound_levels <- c(
  "Acetate (SCFA)", "Butyrate (SCFA)", "Propionate (SCFA)",
  "Succinate (SCFA)",
  "Taurocholic Acid (1\u02daConj. BA)", "Glycocholic Acid (1\u02daConj. BA)",
  "Cholic Acid (1\u02da BA)", "3-Oxolithocholic Acid (2\u02da BA)",
  "Alloisolithocholic Acid (2\u02da BA)", "Deoxycholic Acid (2\u02da BA)",
  "Isodeoxycholic Acid (2\u02da BA)", "Lithocholic Acid (2\u02da BA)",
  "Kynurenic Acid (Kyn. Metab.)", "Kynurenine (Kyn. Metab.)",
  "Anthranilic Acid (Kyn. Metab.)", "Desaminotyrosine",
  "Niacin", "Tyrosine", "Tryptamine", "Tryptophan", "Phenylalanine"
)

metab_boxplot_ecoc_expan <- peri_matrix_all %>%
  select(sampleID, enterococcus_rel_abundance) %>%
  left_join(sample_lookup) %>%
  group_by(sampleID) %>%
  slice(1) %>%
  left_join(
    metab_quant_converted %>%
      filter(compound %in% signif_compounds$compound)
  ) %>%
  filter(compound %!in% c("Kynurenine", "Anthranilic Acid")) %>%
  mutate(
    compound = ifelse(compound == "isovaleric-acid", "isovalerate", compound),
    compound = str_to_title(compound)
  ) %>%
  ungroup() %>%
  mutate(
    db = ifelse(db == "HealthyDonor", "Healthy Donor", "Liver Transplant") %>%
      factor(., levels = c("Healthy Donor", "Liver Transplant")),
    class = case_when(
      compound %in% c("Taurocholic Acid", "Glycocholic Acid") ~
        "Conjugated Primary Bile Acid",
      compound %in% c("Cholic Acid") ~ "Primary Bile Acid",
      compound %in% c(
        "3-Oxolithocholic Acid", "Alloisolithocholic Acid",
        "Deoxycholic Acid", "Isodeoxycholic Acid", "Lithocholic Acid"
      ) ~ "Secondary Bile Acid",
      compound %in% c(
        "Threonine", "Glycine", "Tyrosine", "Tyramine", "Serine",
        "Leucine", "Isoleucine", "Valine", "Phenylalanine",
        "Alanine", "Proline", "Aspartate", "Methionine",
        "Glutamate", "Lysine", "Cysteine", "Tryptophan"
      ) ~ "Amino Acid",
      compound %in% c("Acetate", "Butyrate", "Succinate", "Propionate") ~
        "Short-Chain Fatty Acid",
      compound %in% c(
        "Kynurenic Acid", "Anthranilic Acid", "Kynurenine", "Tryptamine"
      ) ~ "Kynurenine Metabolite",
      compound == "Desaminotyrosine" ~ "Phenolic Aromatic",
      compound == "Niacin" ~ "B-Vitamin",
      TRUE ~ "Indole"
    )
  ) %>%
  drop_na() %>%
  # Row-wise relabelling; the former group_by(compound)/ungroup() around this
  # step did not affect the result and has been removed.
  mutate(
    class = factor(class, levels = ecoc_expan_class_levels),
    compound = paste0(
      compound,
      unname(ecoc_expan_class_suffix[as.character(class)])
    ),
    compound = factor(compound, levels = ecoc_expan_compound_levels)
  ) %>%
  drop_na() %>%
  # 1 = relative abundance at or above the second optimal cutpoint
  mutate(
    enterococcus_expansion = ifelse(
      enterococcus_rel_abundance >= optimal_cutpoint_rel$optimal_cutpoint[2],
      1,
      0
    )
  ) %>%
  arrange(enterococcus_expansion, sampleID) %>%
  group_by(compound) %>%
  # Per-compound group sizes
  mutate(
    enterococcus_expansion_0 = length(mvalue__mM[enterococcus_expansion == "0"]),
    enterococcus_expansion_1 = length(mvalue__mM[enterococcus_expansion == "1"])
  ) %>%
  # Drop compounds that are zero in every sample
  filter(any(mvalue__mM != 0)) %>%
  mutate(
    enterococcus_expansion = ifelse(
      enterococcus_expansion == "1", "Expansion", "No Expansion"
    )
  )


# Per-compound y position for the p-value labels: 10% above the largest
# observed concentration.
metab_boxplot_summary_stats_ecoc_expan <- summarise(
  group_by(metab_boxplot_ecoc_expan, compound),
  y.position = 1.1 * max(mvalue__mM)
)

# Wilcoxon test of concentration by expansion status for every compound, with
# Benjamini-Hochberg adjustment, formatted p-value labels and plotting
# positions. The y position computed by rstatix is replaced by the per-compound
# value from metab_boxplot_summary_stats_ecoc_expan.
metab_boxplot_stats_ecoc_expan <- metab_boxplot_ecoc_expan %>%
  group_by(class, compound) %>%
  rstatix::wilcox_test(mvalue__mM ~ enterococcus_expansion) %>%
  rstatix::adjust_pvalue(method = "BH") %>%
  rstatix::add_significance("p.adj") %>%
  mutate(
    # paste0() rather than paste(): the literal already ends in a space, so
    # paste() produced "p.adj =  0.012" with a double space.
    p.adj = ifelse(
      p.adj < 0.001,
      "p.adj < 0.001",
      paste0("p.adj = ", round(p.adj, 3))
    )
  ) %>%
  # Namespace-qualified like the other rstatix calls in this pipeline
  rstatix::add_xy_position(x = "enterococcus_expansion") %>%
  select(-y.position) %>%
  left_join(metab_boxplot_summary_stats_ecoc_expan, by = "compound")


# gg_metab_boxplot_ecoc_expan <-
# ggboxplot(metab_boxplot_ecoc_expan, x =
# 'enterococcus_expansion', y = 'mvalue__mM', fill =
# 'enterococcus_expansion', alpha = 0.65, outlier.shape =
# NA ) + theme(legend.text = et(size = 14, color =
# 'black'), legend.title = et(size = 14, color = 'black'),
# axis.title.x = eb(), axis.title.y = et(size = 16, color =
# 'black'), strip.text = et(size = 14, color = 'black'),
# strip.background = eb(), panel.border = eb(), axis.line =
# eb()) + facet_wrap(~compound, scales = 'free_y', nrow =
# 2) + annotate('segment', x = 0.35, xend = 0.35, y = 0,
# yend = Inf, colour = 'black', linewidth = 1) +
# annotate('segment', x = 0.35, xend = Inf, y = 0, yend =
# 0, colour = 'black') +
# stat_pvalue_manual(metab_boxplot_stats_ecoc_expan, label
# = '{p.adj}', bracket.size = 0.5) + geom_point(data =
# metab_boxplot_ecoc_expan, aes(x = enterococcus_expansion,
# y = mvalue__mM, fill = enterococcus_expansion), position
# = position_jitter(width = 0.2), size = 2, shape = 21,
# alpha = 0.65, color = 'black') +
# scale_fill_manual('Enterococcus', values = c('gray87',
# '#389458')) + scale_y_continuous(expand = expansion(mult
# = c(0.1, 0.2))) + ylab('Concentration (mM)\n')
# gg_metab_boxplot_ecoc_expan cairo_pdf(filename =
# './Results/Figure_5B.pdf', height = 8, width = 16,
# onefile = TRUE) gg_metab_boxplot_ecoc_expan
# invisible(dev.off())

Figure 3A: sPLS-DA Diversity Groups and Healthy Donors using Qualitative Metabolomics

# Sample x compound matrix of mean qualitative metabolite values (mvalue) for
# the first samples, used as X in the sPLS-DA. Row names are sample IDs.
diversity_metab_mat <-
  metab_qual_anon %>%
  filter(sampleID %in% first_samps$sampleID) %>%
  mutate(
    compound = ifelse(compound == "isovaleric-acid", "isovalerate", compound),
    compound = str_to_title(compound)
  ) %>%
  # Rows with a missing compound are kept here on purpose; they end up in a
  # column called "NA" after pivoting, which is dropped further down.
  filter(compound %in% heatmap_cmpds$compound | is.na(compound)) %>%
  ungroup() %>%
  left_join(metaphlan_df_sumry %>% select(sampleID, diversity_group_abv)) %>%
  group_by(sampleID, compound, diversity_group_abv) %>%
  summarise(mvalue = mean(mvalue, na.rm = TRUE), .groups = "drop") %>%
  # mean() of an all-NA group is NaN; store it as NA
  mutate(across(everything(), ~ replace(.x, is.nan(.x), NA))) %>%
  select(sampleID, compound, mvalue, diversity_group_abv) %>%
  drop_na(sampleID) %>%
  pivot_wider(names_from = "compound", values_from = "mvalue") %>%
  filter(sampleID != "") %>%
  column_to_rownames(var = "sampleID") %>%
  # any_of(): do not fail when no "NA" column was created
  select(-any_of(c("NA", "diversity_group_abv"))) %>%
  # Keep samples with at least one measured compound
  filter(if_any(everything(), ~ !is.na(.x)))

# Diversity-group label for every row of diversity_metab_mat, in row order
# (used as Y in the sPLS-DA).
diversity_metab_labs <-
  left_join(
    rownames_to_column(diversity_metab_mat, var = "sampleID"),
    select(metaphlan_df_sumry, sampleID, diversity_group_abv)
  ) %>%
  droplevels() %>%
  pull(diversity_group_abv)

dim(diversity_metab_mat) #101 93 (means 93 compounds and 101 LT patients/infections)
## [1] 101  93
length(diversity_metab_labs) #101 (means 101 LT patients/infections)
## [1] 101
# Begin model training
# 70/30 split of the rows; seq_len() instead of 1:nrow() so an empty matrix
# cannot yield the indices c(1, 0).
set.seed(1234)
diversity_n_samples <- nrow(diversity_metab_mat)
diversity_train <- sample(seq_len(diversity_n_samples), as.integer(0.7 * diversity_n_samples)) # randomly select 70% samples in training
diversity_test <- setdiff(seq_len(diversity_n_samples), diversity_train) # rest is part of the test set

# store matrices into training and test set:
diversity_metab_mat.train <- diversity_metab_mat[diversity_train, ]
diversity_metab_mat.test <- diversity_metab_mat[diversity_test, ]
diversity_metab_labs.train <- diversity_metab_labs[diversity_train]
diversity_metab_labs.test <- diversity_metab_labs[diversity_test]

# Hyperparameter tuning, step 1: fit a 5-component model on the training set
# and cross-validate it to see how many components are worth keeping.
set.seed(1234)
diversity_train_splsda <- mixOmics::splsda(
  X = diversity_metab_mat.train,
  Y = diversity_metab_labs.train,
  ncomp = 5
)

# Performance assessment: 5-fold cross-validation repeated 50 times
set.seed(1234)
diversity_train_plsda_perf <- perf(
  diversity_train_splsda,
  validation = "Mfold",
  folds = 5,
  nrepeat = 50,
  auc = TRUE,
  progressBar = FALSE
)

# Error rate per number of components
# (ncomp = 4 might be best for classification error rate and max.dist)
plot(
  diversity_train_plsda_perf,
  col = color.mixo(5:7),
  sd = FALSE,
  auc = TRUE,
  legend.position = "horizontal"
)

# Hyperparameter tuning, step 2: candidate numbers of variables to keep on
# each component (1-10, then 20-130 in steps of 10)
diversity_train_keepX <- c(seq_len(10), seq(from = 20, to = 130, by = 10))

set.seed(123)
diversity_train_tune_splsda <- mixOmics::tune.splsda(
  diversity_metab_mat.train,
  diversity_metab_labs.train,
  ncomp = 4, # Choose 4 components (max) to be safe
  test.keepX = diversity_train_keepX,
  validation = "Mfold",
  folds = 5,
  nrepeat = 50,
  dist = "max.dist",
  measure = "BER",
  auc = TRUE,
  progressBar = FALSE
)

plot(diversity_train_tune_splsda, col = color.jet(4))

diversity_train_error <- diversity_train_tune_splsda$error.rate
# optimal number of components based on t-tests on the error rate
diversity_train_ncomp <- diversity_train_tune_splsda$choice.ncomp$ncomp
diversity_train_ncomp #1 component is optimal
## [1] 1

# The final model always uses at least two components: when tuning selects a
# single component, one more is added.
diversity_final_ncomp <- if (diversity_train_ncomp == 1) {
  diversity_train_ncomp + 1
} else {
  diversity_train_ncomp
}

# optimal number of variables to select per component
diversity_train_select_keepX <-
  diversity_train_tune_splsda$choice.keepX[seq_len(diversity_final_ncomp)]
diversity_train_select_keepX
## comp1 comp2 
##    80     8
# Final Model
diversity_train_splsda_final <- mixOmics::splsda(
  diversity_metab_mat.train,
  diversity_metab_labs.train,
  ncomp = diversity_final_ncomp,
  keepX = diversity_train_select_keepX
)

# Test the model on the held-out samples
diversity_predict_train_splsda_final <- predict(
  diversity_train_splsda_final,
  diversity_metab_mat.test,
  dist = "max.dist"
)

# Predicted class from the last component of the final model (component 2 when
# tuning selected a single component)
diversity_predict_train_comp2 <-
  diversity_predict_train_splsda_final$class$max.dist[, max(diversity_train_ncomp, 2)]

# Kept for backward compatibility; no longer passed to factor() (see below).
diversity_union <- union(diversity_predict_train_comp2, diversity_metab_labs.test)

# Predicted (rows) vs. actual (columns) diversity group.
# The previous calls were factor(x, diversity_union, levels = ...). Because
# `levels` was given by name, the union was matched to `labels`, which renamed
# Low/Medium/High to whatever order the union happened to have. Counts and
# overall accuracy were unaffected, but the class names were permuted.
diversity_class_levels <- c("Low", "Medium", "High")
confusionMatrix(
  table(
    factor(diversity_predict_train_comp2, levels = diversity_class_levels),
    factor(diversity_metab_labs.test, levels = diversity_class_levels)
  )
)
# NOTE: the recorded output below predates this fix; its class names follow
# the union order ("Medium", "High", "Low"), not the true class of each
# row/column.
## Confusion Matrix and Statistics
## 
##         
##          Medium High Low
##   Medium     10    4   0
##   High        3    6   3
##   Low         0    0   5
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6774          
##                  95% CI : (0.4863, 0.8332)
##     No Information Rate : 0.4194          
##     P-Value [Acc > NIR] : 0.003321        
##                                           
##                   Kappa : 0.4992          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Medium Class: High Class: Low
## Sensitivity                 0.7692      0.6000     0.6250
## Specificity                 0.7778      0.7143     1.0000
## Pos Pred Value              0.7143      0.5000     1.0000
## Neg Pred Value              0.8235      0.7895     0.8846
## Prevalence                  0.4194      0.3226     0.2581
## Detection Rate              0.3226      0.1935     0.1613
## Detection Prevalence        0.4516      0.3871     0.1613
## Balanced Accuracy           0.7735      0.6571     0.8125
# Background shading of the predicted class regions for the sample plot.
# NOTE(review): the background uses "centroids.dist" while the predictions
# below use "max.dist" - confirm this difference is intended.
diversity_train_background <- background.predict(
  diversity_train_splsda_final,
  comp.predicted = 2,
  xlim = c(-20, 20),
  ylim = c(-20, 20),
  dist = "centroids.dist"
)

# Model metrics for all samples (training and test rows together)
diversity_tot <- predict(
  diversity_train_splsda_final,
  diversity_metab_mat,
  dist = "max.dist"
)

# Predicted class from the last component of the final model (component 2 when
# tuning selected a single component)
diversity_tot_predict <-
  diversity_tot$class$max.dist[, max(diversity_train_ncomp, 2)]

# Kept for backward compatibility; no longer passed to factor() (see below).
diversity_tot_union <- union(diversity_tot_predict, diversity_metab_labs)

# Predicted (rows) vs. actual (columns) diversity group.
# The previous calls were factor(x, diversity_tot_union, levels = ...), which
# matched the union to `labels` and so permuted the class names; a stray
# trailing comma in the confusionMatrix() call has been removed as well.
diversity_tot_class_levels <- c("Low", "Medium", "High")
diversity_cm <- confusionMatrix(
  table(
    factor(diversity_tot_predict, levels = diversity_tot_class_levels),
    factor(diversity_metab_labs, levels = diversity_tot_class_levels)
  )
)
diversity_cm
## Confusion Matrix and Statistics
## 
##         
##          Medium Low High
##   Medium     30  11    2
##   Low         6  27    6
##   High        0   1   18
## 
## Overall Statistics
##                                          
##                Accuracy : 0.7426         
##                  95% CI : (0.646, 0.8244)
##     No Information Rate : 0.3861         
##     P-Value [Acc > NIR] : 3.744e-13      
##                                          
##                   Kappa : 0.6044         
##                                          
##  Mcnemar's Test P-Value : 0.07057        
## 
## Statistics by Class:
## 
##                      Class: Medium Class: Low Class: High
## Sensitivity                 0.8333     0.6923      0.6923
## Specificity                 0.8000     0.8065      0.9867
## Pos Pred Value              0.6977     0.6923      0.9474
## Neg Pred Value              0.8966     0.8065      0.9024
## Prevalence                  0.3564     0.3861      0.2574
## Detection Rate              0.2970     0.2673      0.1782
## Detection Prevalence        0.4257     0.3861      0.1881
## Balanced Accuracy           0.8167     0.7494      0.8395
# Per-class performance measures from mltest (classes ordered Low, Medium,
# High)
diversity_epi <- mltest::ml_test(
  predicted = factor(diversity_tot_predict, levels = c("Low", "Medium", "High")),
  true = factor(diversity_metab_labs, levels = c("Low", "Medium", "High"))
)

# Confusion table relabelled for display: columns are the actual classes, rows
# the predicted ones.
diversity_group_names <- c("Low", "Medium", "High")
diversity_cm_names <- diversity_cm$table
colnames(diversity_cm_names) <- paste0("Actual\n", diversity_group_names)
rownames(diversity_cm_names) <- paste0("Predicted\n", diversity_group_names)

# Transposed so that actual classes are rows in the plotted table
diversity_confusion_df <- t(diversity_cm_names)

# multiclass 95% CI (Wilson method, 1000 bootstrap samples)
diversity_mc <-
  biostatUtil::multiClassCM(
    factor(diversity_metab_labs, levels = c("Low", "Medium", "High")),
    factor(diversity_tot_predict, levels = c("Low", "Medium", "High")),
    seed = 20,
    num.boot = 1000,
    conf.level = 0.95,
    digits = 2,
    method = "wilson"
  )

# Split each per-class cell into estimate / lower / upper numeric columns.
# Cells are split on single spaces into four pieces and the third piece is
# discarded (NA in `into`); presumably the cells look like
# "est (lower - upper)" - TODO confirm against biostatUtil output.
diversity_mc_table <- diversity_mc$table %>%
  as.data.frame() %>%
  separate(Low, into = c("Low_Avg", "Low_Lower", NA, "Low_Upper"), sep = " ") %>%
  separate(Medium, into = c("Med_Avg", "Med_Lower", NA, "Med_Upper"), sep = " ") %>%
  separate(High, into = c("High_Avg", "High_Lower", NA, "High_Upper"), sep = " ") %>%
  select(
    Average,
    Low_Avg, Low_Lower, Low_Upper,
    Med_Avg, Med_Lower, Med_Upper,
    High_Avg, High_Lower, High_Upper
  ) %>%
  # Strip the first parenthesis from every cell, then convert to numeric.
  # across() with a lambda replaces the deprecated funs() helper.
  mutate(across(everything(), ~ str_replace(.x, "\\(|\\)", ""))) %>%
  mutate(across(where(is.character), as.numeric))

# Draw Figure 3A on the current graphics device: the sPLS-DA sample plot
# (components 1 and 2) with prediction background, the confusion table, the
# overall accuracy with its confidence bounds, and per-class balanced accuracy,
# sensitivity and specificity.
#
# Reads diversity_train_splsda_final, diversity_train_background,
# diversity_confusion_df, diversity_cm and diversity_epi from the calling
# environment. Called twice below (PDF and on-screen) instead of duplicating
# the drawing code.
#
# The "Sens." annotations now use diversity_epi$recall. They previously showed
# diversity_epi$precision, i.e. the positive predictive value, under a
# sensitivity label.
# The diagnostic odds ratio annotations (diversity_epi$DOR) were disabled in
# the original and are not drawn.
draw_figure_3a <- function() {
  variates_x <- diversity_train_splsda_final$variates$X
  expl_var_x <- diversity_train_splsda_final$prop_expl_var$X
  comp1_min <- min(variates_x[, 1])

  # Format a proportion as a percentage with one decimal, e.g. 0.7426 -> "74.3%"
  as_pct <- function(p) {
    paste0(formatC(round(p, 3) * 100, digits = 1, format = "f"), "%")
  }

  plotIndiv(
    diversity_train_splsda_final,
    xlim = c(comp1_min * 1.05, max(variates_x[, 1]) * 1.8),
    ylim = c(min(variates_x[, 2]) * 1.15, max(variates_x[, 2]) * 1.8),
    comp = c(1, 2),
    pch = 1,
    ind.names = FALSE,
    legend = FALSE,
    background = diversity_train_background,
    col = c("#3A001E", "#8A0246", "#C20463"),
    star = TRUE,
    point.lwd = 0.5,
    title = NULL,
    size.title = 0.00001,
    style = "graphics",
    legend.title = "Diversity Group",
    X.label = paste0("Component 1 (", round(expl_var_x[1] * 100), "%)"),
    Y.label = paste0("Component 2 (", round(expl_var_x[2] * 100), "%)")
  )

  # Confusion table (actual classes in rows)
  addtable2plot(
    x = comp1_min * 1.1,
    y = -5.25,
    table = diversity_confusion_df,
    bty = "o",
    display.rownames = TRUE,
    hlines = TRUE,
    vlines = TRUE,
    cex = 0.75,
    bg = "white"
  )

  # Overall accuracy with lower/upper confidence bounds
  text(
    x = comp1_min * 1.1,
    y = -5.5,
    labels = paste0(
      "Overall ACC = ", as_pct(diversity_cm$overall[["Accuracy"]]),
      " [", as_pct(diversity_cm$overall[["AccuracyLower"]]),
      ", ", as_pct(diversity_cm$overall[["AccuracyUpper"]]), "]"
    ),
    cex = 0.75,
    adj = 0
  )

  # Per-class annotations: class name, index into the diversity_epi vectors,
  # x position, y positions of the three lines, and colour.
  class_notes <- list(
    list(name = "Low", idx = 1, x = 1.5, y = c(3, 2.7, 2.4), col = "#3A001E"),
    list(name = "Medium", idx = 2, x = 0, y = c(-4.7, -5, -5.3), col = "#8A0246"),
    list(name = "High", idx = 3, x = comp1_min * 1.05, y = c(3, 2.7, 2.4), col = "#C20463")
  )
  for (note in class_notes) {
    class_metrics <- c(
      "ACC" = diversity_epi$balanced.accuracy[[note$idx]],
      "Sens." = diversity_epi$recall[[note$idx]],
      "Spec." = diversity_epi$specificity[[note$idx]]
    )
    text(
      x = note$x,
      y = note$y,
      labels = paste0(
        note$name, " Diversity ", names(class_metrics), " = ",
        round(class_metrics * 100, 1), "%"
      ),
      cex = 0.75,
      adj = 0,
      col = note$col
    )
  }

  invisible(NULL)
}

# Write the figure to PDF; the device is closed even if drawing fails.
pdf(file = "./Results/Figure_3A.pdf", height = 10, width = 10)
invisible(tryCatch(draw_figure_3a(), finally = dev.off()))

# Same figure on the active device (shown in the rendered document)
draw_figure_3a()

Figure 6A: sPLS-DA Enterococcus Expansion using Qualitative Metabolomics

# Sample x compound matrix of mean qualitative metabolite values (mvalue) for
# every sample in peri_matrix_all, used as X in the Enterococcus-expansion
# sPLS-DA. Row names are sample IDs.
ecoc_doms_metab_mat <-
  metab_qual_anon %>%
  mutate(
    compound = ifelse(compound == "isovaleric-acid", "isovalerate", compound),
    compound = str_to_title(compound)
  ) %>%
  # Rows with a missing compound are kept here on purpose; they end up in a
  # column called "NA" after pivoting, which is dropped further down.
  filter(compound %in% heatmap_cmpds$compound | is.na(compound)) %>%
  ungroup() %>%
  group_by(sampleID, compound) %>%
  summarise(mvalue = mean(mvalue, na.rm = TRUE), .groups = "drop") %>%
  # mean() of an all-NA group is NaN; store it as NA
  mutate(across(everything(), ~ replace(.x, is.nan(.x), NA))) %>%
  select(sampleID, compound, mvalue) %>%
  drop_na(sampleID) %>%
  pivot_wider(names_from = "compound", values_from = "mvalue") %>%
  filter(sampleID != "") %>%
  # Restrict to (and add rows for) the samples in peri_matrix_all. The
  # expansion label that used to be computed here was dropped immediately
  # afterwards, so only the sample IDs are joined.
  right_join(peri_matrix_all %>% select(sampleID), by = "sampleID") %>%
  column_to_rownames(var = "sampleID") %>%
  # any_of(): do not fail when no "NA" column was created
  select(-any_of("NA")) %>%
  # Keep samples with at least one measured compound
  filter(if_any(everything(), ~ !is.na(.x)))

  
ecoc_doms_metab_labs <-
  ecoc_doms_metab_mat %>% 
  rownames_to_column(var = "sampleID") %>% 
  left_join(peri_matrix_all %>% 
              group_by(patientID) %>% 
             mutate(domination = case_when(enterococcus_rel_abundance >= optimal_cutpoint_rel$optimal_cutpoint[2] #|
                                             # enterobacterales_rel_abundance >= optimal_cutpoint_rel$optimal_cutpoint[1] 
                                           ~ "Expansion",
                                          TRUE ~ "No Expansion")) %>% 
               select(sampleID, domination)) %>% 
  pull(domination)

dim(ecoc_doms_metab_mat) #107 93 (means 93 compounds and 108 LT patients)
## [1] 107  93
length(ecoc_doms_metab_labs) #107 (means 108 LT patients)
## [1] 107
# Begin model training
# Reproducible 70/30 split of the samples (rows) into training and test sets.
# seq_len() is used instead of 1:nrow() so that an empty matrix yields empty
# index vectors rather than c(1, 0).
set.seed(1234)
ecoc_doms_train <- sample(
  seq_len(nrow(ecoc_doms_metab_mat)),
  as.integer(0.7 * nrow(ecoc_doms_metab_mat))
) # randomly select 70% samples in training
ecoc_doms_test <- setdiff(seq_len(nrow(ecoc_doms_metab_mat)), ecoc_doms_train) # rest is part of the test set

# store matrices into training and test set:
ecoc_doms_metab_mat.train <- ecoc_doms_metab_mat[ecoc_doms_train, ]
ecoc_doms_metab_mat.test <- ecoc_doms_metab_mat[ecoc_doms_test, ]
ecoc_doms_metab_labs.train <- ecoc_doms_metab_labs[ecoc_doms_train]
ecoc_doms_metab_labs.test <- ecoc_doms_metab_labs[ecoc_doms_test]

# Hyperparameter tuning, step 1: fit an exploratory five-component sPLS-DA on
# the training split so cross-validation can show how many components help.
set.seed(1234)
ecoc_doms_train_splsda <- mixOmics::splsda(
  X = ecoc_doms_metab_mat.train,
  Y = ecoc_doms_metab_labs.train,
  ncomp = 5
)

# Cross-validated performance of the exploratory model:
# 5 folds, repeated 50 times, with AUC computed as well.
set.seed(1234)
ecoc_doms_train_plsda_perf <- perf(
  ecoc_doms_train_splsda,
  validation = "Mfold",
  folds = 5,
  nrepeat = 50,
  auc = TRUE,
  progressBar = FALSE
)

# Error rate per number of components
# (ncomp = 1 or 4 is best for classification error rate and max.dist)
plot(
  ecoc_doms_train_plsda_perf,
  col = color.mixo(5:7),
  sd = FALSE,
  auc = TRUE,
  legend.position = "horizontal"
)

# Candidate numbers of variables to keep on each component
ecoc_doms_train_keepX <- c(1:10,  seq(20, 108, 10))

# Hyperparameter tuning, step 2: choose keepX per component by repeated
# 5-fold cross-validation, scored with BER under the max.dist rule.
set.seed(123)
ecoc_doms_train_tune_splsda <- mixOmics::tune.splsda(
  X = ecoc_doms_metab_mat.train,
  Y = ecoc_doms_metab_labs.train,
  ncomp = 4, # Choose 4 components (max) to be safe
  test.keepX = ecoc_doms_train_keepX,
  validation = "Mfold",
  folds = 5,
  nrepeat = 50,
  dist = "max.dist",
  measure = "BER",
  auc = TRUE,
  progressBar = FALSE
)

plot(ecoc_doms_train_tune_splsda, col = color.jet(4))

ecoc_doms_train_error <- ecoc_doms_train_tune_splsda$error.rate
# optimal number of components based on t-tests on the error rate
ecoc_doms_train_ncomp <- ecoc_doms_train_tune_splsda$choice.ncomp$ncomp
# ecoc_doms_train_ncomp = 4 #4 components are optimal via visual inspection

# Optimal number of variables to select per component. When tuning picks a
# single component, the first two entries are kept anyway.
ecoc_doms_train_select_keepX <- ecoc_doms_train_tune_splsda$choice.keepX[
  seq_len(if (ecoc_doms_train_ncomp == 1) ecoc_doms_train_ncomp + 1 else ecoc_doms_train_ncomp)
]
ecoc_doms_train_select_keepX
## comp1 comp2 
##     1    20
# Final Model
# Fitted on the training split with the tuned keepX. A second component is
# added when tuning selects only one, matching the length of
# ecoc_doms_train_select_keepX.
ecoc_doms_train_splsda_final <-
  mixOmics::splsda(
    ecoc_doms_metab_mat.train,
    ecoc_doms_metab_labs.train,
    ncomp = if (ecoc_doms_train_ncomp == 1) ecoc_doms_train_ncomp + 1 else ecoc_doms_train_ncomp,
    keepX = ecoc_doms_train_select_keepX
  )

# Test the model
# Predict the held-out samples and keep the class call made with all fitted
# components (the last column of the max.dist prediction matrix).
ecoc_doms_predict_train_splsda_final <- predict(ecoc_doms_train_splsda_final, ecoc_doms_metab_mat.test, 
                                dist = "max.dist")

ecoc_doms_predict_train_comp2 <- ecoc_doms_predict_train_splsda_final$class$max.dist[
  , if (ecoc_doms_train_ncomp == 1) ecoc_doms_train_ncomp + 1 else ecoc_doms_train_ncomp
]

# Shared level set so predictions and truth are tabulated on the same classes
ecoc_doms_union <- union(ecoc_doms_predict_train_comp2, ecoc_doms_metab_labs.test)

# Held-out confusion matrix (rows = predicted, columns = actual).
# confusionMatrix() has no `negative` argument (it was silently ignored), so
# the positive class used to be whichever level happened to come first in
# ecoc_doms_union. It is now fixed to "Expansion", as for ecoc_doms_cm.
# Any previously rendered output that reports "No Expansion" as the positive
# class has sensitivity and specificity (and PPV/NPV) swapped relative to this.
confusionMatrix(table(factor(ecoc_doms_predict_train_comp2, ecoc_doms_union),
                      factor(ecoc_doms_metab_labs.test, ecoc_doms_union)),
                 positive = "Expansion")
## Confusion Matrix and Statistics
## 
##               
##                No Expansion Expansion
##   No Expansion           16         5
##   Expansion               3         9
##                                           
##                Accuracy : 0.7576          
##                  95% CI : (0.5774, 0.8891)
##     No Information Rate : 0.5758          
##     P-Value [Acc > NIR] : 0.02392         
##                                           
##                   Kappa : 0.4943          
##                                           
##  Mcnemar's Test P-Value : 0.72367         
##                                           
##             Sensitivity : 0.8421          
##             Specificity : 0.6429          
##          Pos Pred Value : 0.7619          
##          Neg Pred Value : 0.7500          
##              Prevalence : 0.5758          
##          Detection Rate : 0.4848          
##    Detection Prevalence : 0.6364          
##       Balanced Accuracy : 0.7425          
##                                           
##        'Positive' Class : No Expansion    
## 
# Prediction background (shaded class regions) for the component 1-2 plot.
# NOTE(review): the background uses centroids.dist while the class calls
# below use max.dist -- confirm this mismatch is intended.
ecoc_doms_train_background <- background.predict(ecoc_doms_train_splsda_final,
                                            comp.predicted = 2,
                                            xlim = c(-20,20),
                                            ylim = c(-20,20),
                                            dist = "centroids.dist") 

# Model metrics for all samples
# Predictions are made for every row of ecoc_doms_metab_mat, i.e. the
# training rows are scored together with the held-out rows.
ecoc_doms_tot <- predict(ecoc_doms_train_splsda_final,
                        ecoc_doms_metab_mat,
                        dist = "max.dist")

ecoc_doms_tot_predict <- ecoc_doms_tot$class$max.dist[
  , if (ecoc_doms_train_ncomp == 1) ecoc_doms_train_ncomp + 1 else ecoc_doms_train_ncomp
]

ecoc_doms_tot_union <- union(ecoc_doms_tot_predict, ecoc_doms_metab_labs)

# Confusion matrix (rows = predicted, columns = actual), positive = Expansion.
# Only `levels` is passed to factor(): supplying the union positionally in
# addition to a named `levels` binds it to `labels`, which renames the classes
# by order of first appearance and can silently swap the two class names.
ecoc_doms_cm <- confusionMatrix(table(factor(ecoc_doms_tot_predict,
                                        levels = c("No Expansion","Expansion")),
                      factor(ecoc_doms_metab_labs,
                             levels = c("No Expansion","Expansion"))),
                      positive = "Expansion")
ecoc_doms_cm
## Confusion Matrix and Statistics
## 
##               
##                No Expansion Expansion
##   No Expansion           59        14
##   Expansion               7        27
##                                           
##                Accuracy : 0.8037          
##                  95% CI : (0.7158, 0.8742)
##     No Information Rate : 0.6168          
##     P-Value [Acc > NIR] : 2.546e-05       
##                                           
##                   Kappa : 0.5709          
##                                           
##  Mcnemar's Test P-Value : 0.1904          
##                                           
##             Sensitivity : 0.6585          
##             Specificity : 0.8939          
##          Pos Pred Value : 0.7941          
##          Neg Pred Value : 0.8082          
##              Prevalence : 0.3832          
##          Detection Rate : 0.2523          
##    Detection Prevalence : 0.3178          
##       Balanced Accuracy : 0.7762          
##                                           
##        'Positive' Class : Expansion       
## 
# Additional model measures
# Diagnostic-test statistics with 95% confidence intervals. Both factors get
# explicit levels with "Expansion" first so the table is always 2 x 2 with
# Expansion in the first row/column (the test-positive / outcome-positive
# cell), even if one class is never predicted.
ecoc_doms_epi <- epiR::epi.tests(
  table(
    factor(ecoc_doms_tot_predict, levels = c("Expansion", "No Expansion")),
    factor(ecoc_doms_metab_labs, levels = c("Expansion", "No Expansion"))
  ),
  conf.level = 0.95
)

# Display version of the 2 x 2 table: transposed so that the outcome (actual)
# classes become rows and the test (predicted) classes become columns, then
# relabelled for the figure.
ecoc_doms_confusion_df <- ecoc_doms_epi$tab %>% 
  t() %>% 
  as.data.frame() %>% 
  rownames_to_column(var = "actual") %>% 
  # Row names containing "+" / "-" are the positive / negative outcome rows;
  # anything else is the totals row.
  mutate(actual = case_when(grepl(actual, pattern = "+", fixed = TRUE) ~ "Actual\nExpansion",
                            grepl(actual, pattern = "-", fixed = TRUE) ~ "Actual\nNo Expansion",
                            TRUE ~ "Total")) %>% 
  dplyr::rename("Predicted\nExpansion" = "Test +",
                "Predicted\nNo Expansion" = "Test -") %>% 
  column_to_rownames(var = "actual")



# Figure 6A: the same drawing is produced twice, first into
# ./Results/Figure_6A.pdf and then on the active graphics device. The drawing
# code lives in one local function so the two copies cannot drift apart, and
# the PDF device is closed even if drawing fails. Everything is wrapped in
# local() so no helper names are left in the workspace.
local({
  scores <- ecoc_doms_train_splsda_final$variates$X  # training-sample scores
  epi_detail <- ecoc_doms_epi$detail

  # Render row `row` of the epi.tests detail table as "value [value, value]",
  # taking the three numbers from columns 2, 3 and 4. Values are rounded to 3
  # decimals, multiplied by `scale`, printed with one decimal, and followed
  # by `suffix`.
  format_stat <- function(row, scale = 1, suffix = "") {
    cell <- function(col) {
      paste0(
        formatC(round(epi_detail[col][row, ], 3) * scale, digits = 1, format = "f"),
        suffix
      )
    }
    paste0(cell(2), " [", cell(3), ", ", cell(4), "]")
  }

  draw_figure <- function() {
    # Component 1 vs 2 scores of the training samples over the prediction
    # background
    plotIndiv(
      ecoc_doms_train_splsda_final,
      comp = c(1,2),
      pch = 1,
      ind.names = FALSE,
      legend = FALSE,
      background = ecoc_doms_train_background,
      col = c("#0C7A3A", "black"),
      star = TRUE,
      point.lwd = 0.5,
      title = NULL,
      size.title = 0.00001,
      style = "graphics", 
      legend.title = "Expansion",
      X.label = paste0("Component 1 (", round(ecoc_doms_train_splsda_final$prop_expl_var$X[1] * 100), "%)"),
      Y.label = paste0("Component 2 (", round(ecoc_doms_train_splsda_final$prop_expl_var$X[2] * 100), "%)")
    )

    # Confusion table, anchored relative to the smallest scores
    addtable2plot(
      y = min(scores[, 2]) * 1.05,
      x = min(scores[, 1]) * 1.15,
      ecoc_doms_confusion_df,
      bty = "o",
      display.rownames = TRUE,
      hlines = TRUE,
      vlines = TRUE,
      cex = 0.75,
      bg = "white"
    )

    # Class labels at hand-picked positions
    text(x = 1.5, y = -3.5, labels = quote(bold("Expansion")),
         cex = 1.75, adj = 0, col = "#0C7A3A")
    text(x = 0.75, y = 3, labels = quote(bold("No Expansion")),
         cex = 1.75, adj = 0, col = "black")

    # Summary statistics, one line each. Accuracy and its interval are
    # elements 1, 3 and 4 of the caret `overall` vector; rows 3, 4 and 6 of
    # the epi.tests detail table are shown as Sens., Spec. and OR.
    text(
      x = max(scores[, 1]) * 0.25,
      y = min(scores[, 2]) * c(0.825, 0.875, 0.925, 0.975),
      labels = c(
        paste0("ACC = ",
               round(ecoc_doms_cm$overall[1]*100, 1), "% [",
               round(ecoc_doms_cm$overall[3]*100, 1), "%, ",
               round(ecoc_doms_cm$overall[4]*100, 1), "%]"),
        paste0("Sens. = ", format_stat(3, scale = 100, suffix = "%")),
        paste0("Spec. = ", format_stat(4, scale = 100, suffix = "%")),
        paste0("OR = ", format_stat(6))
      ),
      cex = 0.75, adj = 0
    )
    invisible(NULL)
  }

  # PDF copy; `finally` guarantees the device is closed on error as well
  pdf(file = "./Results/Figure_6A.pdf", height = 10, width = 10)
  tryCatch(draw_figure(), finally = invisible(dev.off()))

  # Copy on the active device (shown inline when the document is rendered)
  draw_figure()
})

Figure 6C: sPLS-DA Enterobacterales Expansion using Qualitative Metabolomics

# Sample-by-compound matrix of qualitative metabolomics values used to model
# Enterobacterales expansion. Repeated measurements are averaged per sample
# and compound, then spread to one column per compound.
ebac_doms_metab_mat <-
  metab_qual_anon %>%
  mutate(compound = ifelse(compound == "isovaleric-acid", "isovalerate", compound),
       compound = str_to_title(compound)) %>%
  filter(compound %in% heatmap_cmpds$compound|is.na(compound)) %>%
  ungroup() %>%
  group_by(sampleID, compound) %>% 
  summarise(mvalue = mean(mvalue, na.rm = TRUE)) %>% 
  ungroup() %>%
  # mean(..., na.rm = TRUE) over an all-NA group yields NaN; store it as NA
  mutate_all(~replace(., is.nan(.), NA)) %>% 
  select(sampleID, compound, mvalue) %>% 
  drop_na(sampleID) %>% 
  pivot_wider(names_from = "compound", values_from = "mvalue") %>% 
  filter(sampleID != "") %>%
  # Keep every sample listed in peri_matrix_all. Only the key is joined: the
  # expansion label is not needed in the predictor matrix and is derived
  # separately for ebac_doms_metab_labs.
  right_join(peri_matrix_all %>% select(sampleID), by = "sampleID") %>% 
  column_to_rownames(var = "sampleID") %>% 
  # Rows with a missing compound pivot into a column literally named "NA"
  select(-`NA`) %>% 
  # Drop samples that have no measured compound at all
  filter_all(any_vars(!is.na(.)))

  
# Class label per row of ebac_doms_metab_mat (same row order): "Expansion"
# when Enterobacterales relative abundance reaches the first optimal cutpoint,
# otherwise (including a missing abundance) "No Expansion".
ebac_doms_metab_labs <-
  ebac_doms_metab_mat %>% 
  rownames_to_column(var = "sampleID") %>% 
  left_join(peri_matrix_all %>% 
             mutate(domination = case_when(
                                             enterobacterales_rel_abundance >= optimal_cutpoint_rel$optimal_cutpoint[1]
                                           ~ "Expansion",
                                          TRUE ~ "No Expansion")) %>% 
               select(sampleID, domination),
            by = "sampleID") %>% 
  pull(domination)

dim(ebac_doms_metab_mat) #107 93 (means 107 samples and 93 compounds)
## [1] 107  93
length(ebac_doms_metab_labs) #107 (one label per sample)
## [1] 107
# Begin model training
# Reproducible 70/30 split of the samples (rows) into training and test sets.
# seq_len() is used instead of 1:nrow() so that an empty matrix yields empty
# index vectors rather than c(1, 0).
set.seed(1234)
ebac_doms_train <- sample(
  seq_len(nrow(ebac_doms_metab_mat)),
  as.integer(0.7 * nrow(ebac_doms_metab_mat))
) # randomly select 70% samples in training
ebac_doms_test <- setdiff(seq_len(nrow(ebac_doms_metab_mat)), ebac_doms_train) # rest is part of the test set

# store matrices into training and test set:
ebac_doms_metab_mat.train <- ebac_doms_metab_mat[ebac_doms_train, ]
ebac_doms_metab_mat.test <- ebac_doms_metab_mat[ebac_doms_test, ]
ebac_doms_metab_labs.train <- ebac_doms_metab_labs[ebac_doms_train]
ebac_doms_metab_labs.test <- ebac_doms_metab_labs[ebac_doms_test]

# Hyperparameter tuning, step 1: fit an exploratory five-component sPLS-DA on
# the training split so cross-validation can show how many components help.
set.seed(1234)
ebac_doms_train_splsda <- mixOmics::splsda(
  X = ebac_doms_metab_mat.train,
  Y = ebac_doms_metab_labs.train,
  ncomp = 5
)

# Cross-validated performance of the exploratory model:
# 5 folds, repeated 50 times, with AUC computed as well.
set.seed(1234)
ebac_doms_train_plsda_perf <- perf(
  ebac_doms_train_splsda,
  validation = "Mfold",
  folds = 5,
  nrepeat = 50,
  auc = TRUE,
  progressBar = FALSE
)

# Error rate per number of components
# (ncomp = 2 or 4 is best for classification error rate and max.dist)
plot(
  ebac_doms_train_plsda_perf,
  col = color.mixo(5:7),
  sd = FALSE,
  auc = TRUE,
  legend.position = "horizontal"
)

# Candidate numbers of variables to keep on each component
ebac_doms_train_keepX <- c(1:10,  seq(20, 108, 10))

# Hyperparameter tuning, step 2: choose keepX per component by repeated
# 5-fold cross-validation, scored with BER under the max.dist rule.
set.seed(123)
ebac_doms_train_tune_splsda <- mixOmics::tune.splsda(
  X = ebac_doms_metab_mat.train,
  Y = ebac_doms_metab_labs.train,
  ncomp = 4, # Choose 4 components (max) to be safe
  test.keepX = ebac_doms_train_keepX,
  validation = "Mfold",
  folds = 5,
  nrepeat = 50,
  dist = "max.dist",
  measure = "BER",
  auc = TRUE,
  progressBar = FALSE
)

plot(ebac_doms_train_tune_splsda, col = color.jet(4))

ebac_doms_train_error <- ebac_doms_train_tune_splsda$error.rate
# optimal number of components based on t-tests on the error rate
ebac_doms_train_ncomp <- ebac_doms_train_tune_splsda$choice.ncomp$ncomp
ebac_doms_train_ncomp #1 components are optimal
## [1] 1
# Optimal number of variables to select per component. When tuning picks a
# single component, the first two entries are kept anyway.
ebac_doms_train_select_keepX <- ebac_doms_train_tune_splsda$choice.keepX[
  seq_len(if (ebac_doms_train_ncomp == 1) ebac_doms_train_ncomp + 1 else ebac_doms_train_ncomp)
]
ebac_doms_train_select_keepX
## comp1 comp2 
##     1    10
# Final Model
# Fitted on the training split with the tuned keepX. A second component is
# added when tuning selects only one, matching the length of
# ebac_doms_train_select_keepX.
ebac_doms_train_splsda_final <-
  mixOmics::splsda(
    ebac_doms_metab_mat.train,
    ebac_doms_metab_labs.train,
    ncomp = if (ebac_doms_train_ncomp == 1) ebac_doms_train_ncomp + 1 else ebac_doms_train_ncomp,
    keepX = ebac_doms_train_select_keepX
  )

# Test the model
# Predict the held-out samples and keep the class call made with all fitted
# components (the last column of the max.dist prediction matrix).
ebac_doms_predict_train_splsda_final <- predict(ebac_doms_train_splsda_final, ebac_doms_metab_mat.test, 
                                dist = "max.dist")

ebac_doms_predict_train_comp2 <- ebac_doms_predict_train_splsda_final$class$max.dist[
  , if (ebac_doms_train_ncomp == 1) ebac_doms_train_ncomp + 1 else ebac_doms_train_ncomp
]

# Shared level set so predictions and truth are tabulated on the same classes
ebac_doms_union <- union(ebac_doms_predict_train_comp2, ebac_doms_metab_labs.test)

# Held-out confusion matrix (rows = predicted, columns = actual).
# confusionMatrix() has no `negative` argument (it was silently ignored), so
# the positive class used to be whichever level happened to come first in
# ebac_doms_union. It is now fixed to "Expansion", as for ebac_doms_cm.
confusionMatrix(table(factor(ebac_doms_predict_train_comp2, ebac_doms_union),
                      factor(ebac_doms_metab_labs.test, ebac_doms_union)),
                 positive = "Expansion")
## Confusion Matrix and Statistics
## 
##               
##                Expansion No Expansion
##   Expansion            1            1
##   No Expansion         6           25
##                                           
##                Accuracy : 0.7879          
##                  95% CI : (0.6109, 0.9102)
##     No Information Rate : 0.7879          
##     P-Value [Acc > NIR] : 0.5994          
##                                           
##                   Kappa : 0.1413          
##                                           
##  Mcnemar's Test P-Value : 0.1306          
##                                           
##             Sensitivity : 0.14286         
##             Specificity : 0.96154         
##          Pos Pred Value : 0.50000         
##          Neg Pred Value : 0.80645         
##              Prevalence : 0.21212         
##          Detection Rate : 0.03030         
##    Detection Prevalence : 0.06061         
##       Balanced Accuracy : 0.55220         
##                                           
##        'Positive' Class : Expansion       
## 
# Prediction background (shaded class regions) for the component 1-2 plot.
# NOTE(review): the background uses centroids.dist while the class calls
# below use max.dist -- confirm this mismatch is intended.
ebac_doms_train_background <- background.predict(ebac_doms_train_splsda_final,
                                            comp.predicted = 2,
                                            xlim = c(-20,20),
                                            ylim = c(-20,20),
                                            dist = "centroids.dist") 

# Model metrics for all samples
# Predictions are made for every row of ebac_doms_metab_mat, i.e. the
# training rows are scored together with the held-out rows.
ebac_doms_tot <- predict(ebac_doms_train_splsda_final,
                        ebac_doms_metab_mat,
                        dist = "max.dist")

ebac_doms_tot_predict <- ebac_doms_tot$class$max.dist[
  , if (ebac_doms_train_ncomp == 1) ebac_doms_train_ncomp + 1 else ebac_doms_train_ncomp
]

ebac_doms_tot_union <- union(ebac_doms_tot_predict, ebac_doms_metab_labs)

# Confusion matrix (rows = predicted, columns = actual), positive = Expansion.
# Only `levels` is passed to factor(): supplying the union positionally in
# addition to a named `levels` binds it to `labels`, which renames the classes
# by order of first appearance. When the union starts with "Expansion" that
# swaps the two class names, so sensitivity/specificity, PPV/NPV and
# prevalence are reported for the wrong class (accuracy and kappa are
# unaffected). Output rendered before this fix shows the swapped values and
# needs to be regenerated.
ebac_doms_cm <- confusionMatrix(table(factor(ebac_doms_tot_predict,
                                        levels = c("No Expansion","Expansion")),
                      factor(ebac_doms_metab_labs,
                             levels = c("No Expansion","Expansion"))),
                      positive = "Expansion")
ebac_doms_cm
## Confusion Matrix and Statistics
## 
##               
##                Expansion No Expansion
##   Expansion           78           20
##   No Expansion         1            8
##                                           
##                Accuracy : 0.8037          
##                  95% CI : (0.7158, 0.8742)
##     No Information Rate : 0.7383          
##     P-Value [Acc > NIR] : 0.07337         
##                                           
##                   Kappa : 0.3496          
##                                           
##  Mcnemar's Test P-Value : 8.568e-05       
##                                           
##             Sensitivity : 0.9873          
##             Specificity : 0.2857          
##          Pos Pred Value : 0.7959          
##          Neg Pred Value : 0.8889          
##              Prevalence : 0.7383          
##          Detection Rate : 0.7290          
##    Detection Prevalence : 0.9159          
##       Balanced Accuracy : 0.6365          
##                                           
##        'Positive' Class : Expansion       
## 
# Additional model measures
# Diagnostic-test statistics with 95% confidence intervals. Both factors get
# explicit levels with "Expansion" first so the table is always 2 x 2 with
# Expansion in the first row/column (the test-positive / outcome-positive
# cell), even if one class is never predicted.
ebac_doms_epi <- epiR::epi.tests(
  table(
    factor(ebac_doms_tot_predict, levels = c("Expansion", "No Expansion")),
    factor(ebac_doms_metab_labs, levels = c("Expansion", "No Expansion"))
  ),
  conf.level = 0.95
)

# Display version of the 2 x 2 table: transposed so that the outcome (actual)
# classes become rows and the test (predicted) classes become columns, then
# relabelled for the figure.
ebac_doms_confusion_df <- ebac_doms_epi$tab %>% 
  t() %>% 
  as.data.frame() %>% 
  rownames_to_column(var = "actual") %>% 
  # Row names containing "+" / "-" are the positive / negative outcome rows;
  # anything else is the totals row.
  mutate(actual = case_when(grepl(actual, pattern = "+", fixed = TRUE) ~ "Actual\nExpansion",
                            grepl(actual, pattern = "-", fixed = TRUE) ~ "Actual\nNo Expansion",
                            TRUE ~ "Total")) %>% 
  dplyr::rename("Predicted\nExpansion" = "Test +",
                "Predicted\nNo Expansion" = "Test -") %>% 
  column_to_rownames(var = "actual")


# Figure 6C: the same drawing is produced twice, first into
# ./Results/Figure_6C.pdf and then on the active graphics device. The drawing
# code lives in one local function so the two copies cannot drift apart, and
# the PDF device is closed even if drawing fails. Everything is wrapped in
# local() so no helper names are left in the workspace.
local({
  scores <- ebac_doms_train_splsda_final$variates$X  # training-sample scores
  epi_detail <- ebac_doms_epi$detail

  # Render row `row` of the epi.tests detail table as "value [value, value]",
  # taking the three numbers from columns 2, 3 and 4. Values are rounded to 3
  # decimals, multiplied by `scale`, printed with one decimal, and followed
  # by `suffix`.
  format_stat <- function(row, scale = 1, suffix = "") {
    cell <- function(col) {
      paste0(
        formatC(round(epi_detail[col][row, ], 3) * scale, digits = 1, format = "f"),
        suffix
      )
    }
    paste0(cell(2), " [", cell(3), ", ", cell(4), "]")
  }

  draw_figure <- function() {
    # Component 1 vs 2 scores of the training samples over the prediction
    # background, with a fixed y-range
    plotIndiv(
      ebac_doms_train_splsda_final,
      comp = c(1,2),
      ylim = c(-5,16),
      pch = 1,
      ind.names = FALSE,
      legend = FALSE,
      background = ebac_doms_train_background,
      col = c("#FF0000", "black"),
      star = TRUE,
      point.lwd = 0.5,
      title = NULL,
      size.title = 0.00001,
      style = "graphics", 
      legend.title = "Expansion",
      X.label = paste0("Component 1 (", round(ebac_doms_train_splsda_final$prop_expl_var$X[1] * 100), "%)"),
      Y.label = paste0("Component 2 (", round(ebac_doms_train_splsda_final$prop_expl_var$X[2] * 100), "%)")
    )

    # Confusion table, anchored at negative multiples of the smallest scores
    addtable2plot(
      y = min(scores[, 2]) * -3,
      x = min(scores[, 1]) * -10,
      ebac_doms_confusion_df,
      bty = "o",
      display.rownames = TRUE,
      hlines = TRUE,
      vlines = TRUE,
      cex = 0.75,
      bg = "white"
    )

    # Class labels at hand-picked positions
    text(x = 1.25, y = 12, labels = quote(bold("Expansion")),
         cex = 1.75, adj = 0, col = "#FF0000")
    text(x = 0.5, y = -4, labels = quote(bold("No Expansion")),
         cex = 1.75, adj = 0, col = "black")

    # Summary statistics, one line each. Accuracy and its interval are
    # elements 1, 3 and 4 of the caret `overall` vector; rows 3, 4 and 6 of
    # the epi.tests detail table are shown as Sens., Spec. and OR.
    text(
      x = min(scores[, 1]) * -12,
      y = min(scores[, 2]) * c(-2.25, -2.05, -1.85, -1.65),
      labels = c(
        paste0("ACC = ",
               round(ebac_doms_cm$overall[1]*100, 1), "% [",
               round(ebac_doms_cm$overall[3]*100, 1), "%, ",
               round(ebac_doms_cm$overall[4]*100, 1), "%]"),
        paste0("Sens. = ", format_stat(3, scale = 100, suffix = "%")),
        paste0("Spec. = ", format_stat(4, scale = 100, suffix = "%")),
        paste0("OR = ", format_stat(6))
      ),
      cex = 0.75, adj = 0
    )
    invisible(NULL)
  }

  # PDF copy; `finally` guarantees the device is closed on error as well
  pdf(file = "./Results/Figure_6C.pdf", height = 10, width = 10)
  tryCatch(draw_figure(), finally = invisible(dev.off()))

  # Copy on the active device (shown inline when the document is rendered)
  draw_figure()
})

Figure 7A: sPLS-DA Any Bacterial Infection using Qualitative Metabolomics

# Sample-by-compound matrix of qualitative metabolomics values used to model
# bacterial infection. Repeated measurements are averaged per sample and
# compound, then spread to one column per compound.
infx_metab_mat <-
  metab_qual_anon %>%
  mutate(compound = ifelse(compound == "isovaleric-acid", "isovalerate", compound),
       compound = str_to_title(compound)) %>%
  filter(compound %in% heatmap_cmpds$compound|is.na(compound)) %>%
  ungroup() %>%
  # Keep every sample listed in peri_matrix_all
  right_join(peri_matrix_all %>% select(sampleID, bact_infection_present)) %>% 
  group_by(sampleID, compound, bact_infection_present) %>% 
  summarise(mvalue = mean(mvalue, na.rm = TRUE)) %>% 
  ungroup() %>%
  # mean(..., na.rm = TRUE) over an all-NA group yields NaN; store it as NA
  mutate_all(~replace(., is.nan(.), NA)) %>% 
  select(sampleID, compound, mvalue, bact_infection_present) %>% 
  drop_na(sampleID) %>% 
  pivot_wider(names_from = "compound", values_from = "mvalue") %>% 
  filter(sampleID != "") %>%
  # The infection status is not a predictor; it is recoded separately for
  # infx_metab_labs, so it is dropped here without recoding it first.
  select(-bact_infection_present) %>% 
  column_to_rownames(var = "sampleID") %>% 
  # Rows with a missing compound pivot into a column literally named "NA"
  select(-`NA`) %>% 
  # Drop samples that have no measured compound at all
  filter_all(any_vars(!is.na(.)))

  
# Class label per row of infx_metab_mat (same row order): "No Infection" when
# bact_infection_present contains "No", otherwise "Infection".
# NOTE(review): grepl() returns FALSE for NA, so a missing status is labelled
# "Infection" -- confirm that no status is missing or that this is intended.
infx_metab_labs <-
  infx_metab_mat %>% 
  rownames_to_column(var = "sampleID") %>% 
  left_join(peri_matrix_all %>% select(sampleID, bact_infection_present)) %>% 
  mutate(bact_infection_present = ifelse(grepl(x = bact_infection_present, pattern = "No"), "No Infection", "Infection")) %>% 
  pull(bact_infection_present)

dim(infx_metab_mat) #107 93 (means 107 samples and 93 compounds)
## [1] 107  93
length(infx_metab_labs) #107 (one label per sample)
## [1] 107
# Begin model training
# Reproducible 70/30 split of the samples (rows) into training and test sets.
# seq_len() is used instead of 1:nrow() so that an empty matrix yields empty
# index vectors rather than c(1, 0).
set.seed(1234)
infx_train <- sample(
  seq_len(nrow(infx_metab_mat)),
  as.integer(0.7 * nrow(infx_metab_mat))
) # randomly select 70% samples in training
infx_test <- setdiff(seq_len(nrow(infx_metab_mat)), infx_train) # rest is part of the test set

# store matrices into training and test set:
infx_metab_mat.train <- infx_metab_mat[infx_train, ]
infx_metab_mat.test <- infx_metab_mat[infx_test, ]
infx_metab_labs.train <- infx_metab_labs[infx_train]
infx_metab_labs.test <- infx_metab_labs[infx_test]

# Train the model to tune hyperparameters
# Initial model to find optimal number of components to include.
# No keepX is supplied here, so this first fit does no variable selection of
# its own choosing; it is only used to compare 1 to 5 components.
set.seed(1234)
infx_train_splsda <- mixOmics::splsda(infx_metab_mat.train, infx_metab_labs.train, ncomp = 5)

# Performance assessment
## 5-fold cross-validation repeated 50 times (folds = 5, nrepeat = 50 below)
# The seed is reset so the fold assignment is reproducible.
set.seed(1234)
infx_train_plsda_perf <-
  perf(
    infx_train_splsda,
    validation = "Mfold",
    folds = 5,
    progressBar = FALSE,
    auc = TRUE,
    nrepeat = 50
  )

# Error rate per number of components, one line per prediction distance
plot(
  infx_train_plsda_perf,
  col = color.mixo(5:7),
  sd = FALSE,
  auc = TRUE,
  legend.position = "horizontal"
) # ncomp = 1 is best for classification error rate and max.dist

# Number of optimal variables to select for each component
# Candidate grid: 1..10, then 20, 30, ..., 100 (seq(20, 108, 10) stops at 100).
# NOTE(review): the matrix has 93 compound columns, so the candidate 100 exceeds
# the number of variables; presumably tune.splsda discards such values - confirm.
infx_train_keepX <- c(1:10,  seq(20, 108, 10))

# NOTE(review): this seed (123) differs from the 1234 used for the other steps
# of this section; changing it would change the tuning result.
set.seed(123)
infx_train_tune_splsda <-
  mixOmics::tune.splsda(
    infx_metab_mat.train,
    infx_metab_labs.train,
    ncomp = 3, # Choose 3 components (max) to be safe
    validation = 'Mfold',
    folds = 5,
    dist = 'max.dist',
    progressBar = FALSE,
    auc = TRUE,
    measure = "BER",
    test.keepX = infx_train_keepX,
    nrepeat = 50
  )

# Tuning curves: one line per component across the keepX grid
plot(infx_train_tune_splsda, col = color.jet(3))

# Error rates from the tuning run (not referenced again in the visible part of this section)
infx_train_error <- infx_train_tune_splsda$error.rate
infx_train_ncomp <- infx_train_tune_splsda$choice.ncomp$ncomp # optimal number of components based on t-tests on the error rate
infx_train_ncomp #1 component is optimal
## [1] 1
# One component more than the tuned optimum is kept (ncomp + 1): the figure code
# further down plots components 1 and 2, which needs at least two components.
# NOTE(review): the predictions below are also read from component ncomp + 1,
# i.e. not from the component count that tuning selected - confirm this is intended.
infx_train_select_keepX <- infx_train_tune_splsda$choice.keepX[1:(infx_train_ncomp + 1)]  # optimal number of variables to select per component
infx_train_select_keepX
## comp1 comp2 
##    90     3
# Final Model
# Refit on the training rows with the tuned number of variables per component
# and one component more than the tuned optimum.
infx_train_splsda_final <- mixOmics::splsda(
  X = infx_metab_mat.train,
  Y = infx_metab_labs.train,
  ncomp = infx_train_ncomp + 1,
  keepX = infx_train_select_keepX
)

# Test the model: classify the held-out samples with the max.dist rule
infx_predict_train_splsda_final <- predict(
  object = infx_train_splsda_final,
  newdata = infx_metab_mat.test,
  dist = "max.dist"
)

# Predicted class per held-out sample, read from the last fitted component
infx_predict_train_comp2 <-
  infx_predict_train_splsda_final[["class"]][["max.dist"]][, infx_train_ncomp + 1]

# Common level set, so both factors share the same levels even if one class
# is never predicted
infx_union <- union(infx_predict_train_comp2, infx_metab_labs.test)

# Held-out performance: predictions in rows, true labels in columns
table(
  factor(infx_predict_train_comp2, levels = infx_union),
  factor(infx_metab_labs.test, levels = infx_union)
) %>%
  confusionMatrix(positive = "Infection")
## Confusion Matrix and Statistics
## 
##               
##                No Infection Infection
##   No Infection           20         6
##   Infection               6         1
##                                          
##                Accuracy : 0.6364         
##                  95% CI : (0.4512, 0.796)
##     No Information Rate : 0.7879         
##     P-Value [Acc > NIR] : 0.9865         
##                                          
##                   Kappa : -0.0879        
##                                          
##  Mcnemar's Test P-Value : 1.0000         
##                                          
##             Sensitivity : 0.1429         
##             Specificity : 0.7692         
##          Pos Pred Value : 0.1429         
##          Neg Pred Value : 0.7692         
##              Prevalence : 0.2121         
##          Detection Rate : 0.0303         
##    Detection Prevalence : 0.2121         
##       Balanced Accuracy : 0.4560         
##                                          
##        'Positive' Class : Infection      
## 
# Prediction regions used as the shaded background of Figure 7A.
# NOTE(review): the background uses "centroids.dist" while every prediction in
# this section uses "max.dist"; confirm the mismatch is intended.
infx_train_background <- background.predict(infx_train_splsda_final,
                                            comp.predicted = 2,
                                            xlim = c(-20,20),
                                            ylim = c(-20,20),
                                            dist = "centroids.dist")

# Model metrics for all samples
# (training and held-out rows together, so this is not an out-of-sample estimate)
infx_tot <- predict(infx_train_splsda_final,
                    infx_metab_mat,
                    dist = "max.dist")

infx_tot_predict <- infx_tot$class$max.dist[, (infx_train_ncomp + 1)]

# Kept for any later use; no longer needed to build the confusion matrix
infx_tot_union <- union(infx_tot_predict, infx_metab_labs)

# Both factors get the same explicit levels. Previously the union vector was
# passed as an extra unnamed argument next to `levels =`, so R matched it to
# `labels` and renamed the classes in union order. Whenever "No Infection" came
# first in the union, the two class names were swapped, and sensitivity,
# specificity and the predictive values were reported for the wrong class.
# Accuracy and its confidence interval are not affected by the swap.
infx_cm <- confusionMatrix(
  table(
    factor(infx_tot_predict, levels = c("Infection", "No Infection")),
    factor(infx_metab_labs, levels = c("Infection", "No Infection"))
  ),
  positive = "Infection"
)
infx_cm
## Confusion Matrix and Statistics
## 
##               
##                No Infection Infection
##   No Infection           12         7
##   Infection              15        73
##                                           
##                Accuracy : 0.7944          
##                  95% CI : (0.7054, 0.8664)
##     No Information Rate : 0.7477          
##     P-Value [Acc > NIR] : 0.1582          
##                                           
##                   Kappa : 0.3958          
##                                           
##  Mcnemar's Test P-Value : 0.1356          
##                                           
##             Sensitivity : 0.9125          
##             Specificity : 0.4444          
##          Pos Pred Value : 0.8295          
##          Neg Pred Value : 0.6316          
##              Prevalence : 0.7477          
##          Detection Rate : 0.6822          
##    Detection Prevalence : 0.8224          
##       Balanced Accuracy : 0.6785          
##                                           
##        'Positive' Class : Infection       
## 
# Additional model measures
# Diagnostic-test statistics with 95% confidence limits from the 2x2 table of
# predicted class (rows) against true class (columns).
infx_epi <- epiR::epi.tests(
  dat = table(infx_tot_predict, infx_metab_labs),
  conf.level = 0.95
)

# Counts table for the figure: transposed so rows are the actual class and
# columns the predicted class, with display labels in place of the "+" / "-"
# names used by epi.tests.
infx_confusion_df <-
  as.data.frame(t(infx_epi$tab)) %>%
  rownames_to_column(var = "actual") %>%
  mutate(
    actual = case_when(
      grepl("+", actual, fixed = TRUE) ~ "Actual\nInfection",
      grepl("-", actual, fixed = TRUE) ~ "Actual\nNo Infection",
      TRUE ~ "Total"
    )
  ) %>%
  dplyr::rename(
    "Predicted\nInfection" = "Test +",
    "Predicted\nNo Infection" = "Test -"
  ) %>%
  column_to_rownames(var = "actual")

# Figure 7A is drawn twice with identical content: once into a PDF file and
# once on the active device. The drawing code lives in one helper so the two
# copies cannot drift apart.

# Builds the four annotation strings shown on an sPLS-DA sample plot:
# accuracy, sensitivity, specificity and odds ratio, each as
# "estimate [lower, upper]".
#   cm  : caret confusionMatrix result; elements 1, 3 and 4 of `overall` are
#         used as accuracy and its confidence limits.
#   epi : epiR::epi.tests result; columns 2-4 of `detail` are used as
#         estimate / lower / upper, and rows 3, 4 and 6 as sensitivity,
#         specificity and odds ratio (row order as relied on by this script -
#         verify after epiR upgrades).
# Returns a character vector of length 4.
splsda_stat_labels <- function(cm, epi) {
  est_pct <- function(col, row) {
    paste0(formatC(round(epi$detail[col][row, ], 3) * 100, digits = 1, format = "f"), "%")
  }
  est_raw <- function(col, row) {
    formatC(round(epi$detail[col][row, ], 3), digits = 1, format = "f")
  }
  c(
    paste0("ACC = ",
           round(cm$overall[1] * 100, 1), "% [",
           round(cm$overall[3] * 100, 1), "%, ",
           round(cm$overall[4] * 100, 1), "%]"),
    paste0("Sens. = ", est_pct(2, 3), " [", est_pct(3, 3), ", ", est_pct(4, 3), "]"),
    paste0("Spec. = ", est_pct(2, 4), " [", est_pct(3, 4), ", ", est_pct(4, 4), "]"),
    paste0("OR = ", est_raw(2, 6), " [", est_raw(3, 6), ", ", est_raw(4, 6), "]")
  )
}

# Draws Figure 7A on the current graphics device: the sample plot for
# components 1 and 2, the confusion table, the two class captions and the
# performance annotations.
#   model        : fitted sPLS-DA model
#   background   : background.predict() result for the model
#   confusion_df : table of actual vs predicted counts to draw on the plot
#   cm, epi      : see splsda_stat_labels()
# Returns NULL invisibly; called for its drawing side effect.
draw_infx_figure_7a <- function(model, background, confusion_df, cm, epi) {
  # The table and statistics are anchored relative to the largest scores
  x_max <- max(model$variates$X[, 1])
  y_max <- max(model$variates$X[, 2])

  plotIndiv(
    model,
    comp = c(1, 2),
    pch = 1,
    ind.names = FALSE,
    legend = FALSE,
    background = background,
    col = c("goldenrod", "gray75"),
    star = TRUE,
    point.lwd = 0.5,
    title = NULL,
    size.title = 0.00001,
    style = "graphics",
    legend.title = "Infection Group",
    X.label = paste0("Component 1 (", round(model$prop_expl_var$X[1] * 100), "%)"),
    Y.label = paste0("Component 2 (", round(model$prop_expl_var$X[2] * 100), "%)")
  )
  addtable2plot(
    x = x_max * 0.4,
    y = y_max * 0.65,
    table = confusion_df,
    bty = "o",
    display.rownames = TRUE,
    hlines = TRUE,
    vlines = TRUE,
    cex = 0.75,
    bg = "white"
  )
  # Class captions sit at fixed plot coordinates chosen for this model
  text(x = 6.5, y = 0, labels = quote(bold("Infection")),
       cex = 1.75, adj = 0, col = "goldenrod")
  text(x = -4.5, y = 4, labels = quote(bold("No Infection")),
       cex = 1.75, adj = 0, col = "gray70")
  # One text() call places the four statistics lines at descending heights
  text(
    x = x_max * 0.5,
    y = y_max * c(0.6, 0.55, 0.5, 0.45),
    labels = splsda_stat_labels(cm, epi),
    cex = 0.75,
    adj = 0
  )
  invisible(NULL)
}

# PDF copy; the device is closed even if drawing fails
{
pdf(file = "./Results/Figure_7A.pdf", height = 10, width = 10)
invisible(tryCatch(
  draw_infx_figure_7a(infx_train_splsda_final, infx_train_background,
                      infx_confusion_df, infx_cm, infx_epi),
  finally = invisible(dev.off())
))
}

# Inline copy for the rendered report
draw_infx_figure_7a(infx_train_splsda_final, infx_train_background,
                    infx_confusion_df, infx_cm, infx_epi)

Combine Figure 6A + 6C: sPLS-DA Enterococcus and Enterobacterales Expansion

# Text and point size multipliers for the combined figures
axis.title.cex <- 3.275
axis.text.cex <- 2.5
plot.text.cex <- 2.5
plot.group.cex <- 2.5
point.cex <- 3

# Figure 6A (Enterococcus expansion) stacked above Figure 6C (Enterobacterales
# expansion) in one PDF. Everything runs inside local() so the helper and the
# temporary values below do not leak into the workspace, and the PDF device is
# closed even if drawing fails.
local({
    # Builds the four annotation strings for one panel: accuracy, sensitivity,
    # specificity and odds ratio, each as "estimate [lower, upper]".
    #   cm  : caret confusionMatrix result (overall[1], [3], [4] are used)
    #   epi : epiR::epi.tests result; columns 2-4 of `detail` are used as
    #         estimate / lower / upper, rows 3, 4 and 6 as sensitivity,
    #         specificity and odds ratio (row order as relied on by this
    #         script - verify after epiR upgrades).
    splsda_stat_labels <- function(cm, epi) {
        est_pct <- function(col, row) {
            paste0(formatC(round(epi$detail[col][row, ], 3) * 100,
                digits = 1, format = "f"), "%")
        }
        est_raw <- function(col, row) {
            formatC(round(epi$detail[col][row, ], 3), digits = 1, format = "f")
        }
        c(
            paste0("ACC = ", round(cm$overall[1] * 100, 1), "% [",
                round(cm$overall[3] * 100, 1), "%, ",
                round(cm$overall[4] * 100, 1), "%]"),
            paste0("Sens. = ", est_pct(2, 3), " [", est_pct(3, 3), ", ",
                est_pct(4, 3), "]"),
            paste0("Spec. = ", est_pct(2, 4), " [", est_pct(3, 4), ", ",
                est_pct(4, 4), "]"),
            paste0("OR = ", est_raw(2, 6), " [", est_raw(3, 6), ", ",
                est_raw(4, 6), "]")
        )
    }

    pdf(file = "./Results/Figure_6AC.pdf", height = 17, width = 11)
    tryCatch({
        # Two stacked panels. The margins are the ones that were actually in
        # effect before: an earlier mar value was overwritten immediately.
        par(mfrow = c(2, 1), mar = c(5, 5, 2, 1))

        # Figure 6A
        ecoc_scores <- ecoc_doms_train_splsda_final$variates$X
        plotIndiv(
            ecoc_doms_train_splsda_final,
            comp = c(1, 2),
            pch = 1,
            ind.names = FALSE,
            legend = FALSE,
            background = ecoc_doms_train_background,
            col = c("#0C7A3A", "black"),
            star = TRUE,
            point.lwd = 0.5,
            title = NULL,
            size.title = 1e-05,
            style = "graphics",
            legend.title = "Expansion",
            X.label = paste0("Component 1 (",
                round(ecoc_doms_train_splsda_final$prop_expl_var$X[1] * 100), "%)"),
            Y.label = paste0("Component 2 (",
                round(ecoc_doms_train_splsda_final$prop_expl_var$X[2] * 100), "%)")
        )
        addtable2plot(
            x = min(ecoc_scores[, 1]) * 1.15,
            y = min(ecoc_scores[, 2]) * 1.05,
            table = ecoc_doms_confusion_df,
            bty = "o", display.rownames = TRUE, hlines = TRUE, vlines = TRUE,
            cex = 0.75, bg = "white"
        )
        # Class captions at fixed plot coordinates chosen for this model
        text(x = 1.5, y = -3.5, labels = quote(bold("Expansion")),
            cex = 1.75, adj = 0, col = "#0C7A3A")
        text(x = 0.75, y = 3, labels = quote(bold("No Expansion")),
            cex = 1.75, adj = 0, col = "black")
        # Four statistics lines, anchored relative to the score range
        text(
            x = max(ecoc_scores[, 1]) * 0.25,
            y = min(ecoc_scores[, 2]) * c(0.825, 0.875, 0.925, 0.975),
            labels = splsda_stat_labels(ecoc_doms_cm, ecoc_doms_epi),
            cex = 0.75, adj = 0
        )

        # Figure 6C
        par(mar = c(5, 5, 2, 1))
        ebac_scores <- ebac_doms_train_splsda_final$variates$X
        plotIndiv(
            ebac_doms_train_splsda_final,
            comp = c(1, 2),
            ylim = c(-5, 16),
            pch = 1,
            ind.names = FALSE,
            legend = FALSE,
            background = ebac_doms_train_background,
            col = c("#FF0000", "black"),
            star = TRUE,
            point.lwd = 0.5,
            title = NULL,
            size.title = 1e-05,
            style = "graphics",
            legend.title = "Expansion",
            X.label = paste0("Component 1 (",
                round(ebac_doms_train_splsda_final$prop_expl_var$X[1] * 100), "%)"),
            Y.label = paste0("Component 2 (",
                round(ebac_doms_train_splsda_final$prop_expl_var$X[2] * 100), "%)")
        )
        addtable2plot(
            x = min(ebac_scores[, 1]) * -10,
            y = min(ebac_scores[, 2]) * -3,
            table = ebac_doms_confusion_df,
            bty = "o", display.rownames = TRUE, hlines = TRUE, vlines = TRUE,
            cex = 0.75, bg = "white"
        )
        text(x = 1.25, y = 12, labels = quote(bold("Expansion")),
            cex = 1.75, adj = 0, col = "#FF0000")
        text(x = 0.5, y = -4, labels = quote(bold("No Expansion")),
            cex = 1.75, adj = 0, col = "black")
        text(
            x = min(ebac_scores[, 1]) * -12,
            y = min(ebac_scores[, 2]) * c(-2.25, -2.05, -1.85, -1.65),
            labels = splsda_stat_labels(ebac_doms_cm, ebac_doms_epi),
            cex = 0.75, adj = 0
        )
    }, finally = invisible(dev.off()))

    invisible(NULL)
})

Supplemental Figure 3: Plot Loadings for Diversity Groups

# Plot absolute loadings so every bar is non-negative, which makes the
# diversity-group loadings plots narrower. This overwrites the loadings stored
# in the model object itself.
diversity_train_splsda_final$loadings$X <- abs(diversity_train_splsda_final$loadings$X)

# Supplemental Figures 3A + 3B combined in one PDF page

{
  pdf(file = "./Results/Supplemental_Figure_3.pdf", height = 8, width = 11)

  # Three slots per row; only the first two are filled
  par(mfrow = c(1, 3))

  # Supplemental Figure 3A: component 1, legend suppressed
  plotLoadings(diversity_train_splsda_final, comp = 1,
               contrib = "max", method = "mean", ndisplay = 50,
               size.name = 1.1, size.title = rel(1),
               legend = FALSE, legend.col = c("#EDE342", "#F69A97", "#FF51EB"))

  # Supplemental Figure 3B: component 2, default legend
  plotLoadings(diversity_train_splsda_final, comp = 2,
               contrib = "max", method = "mean", ndisplay = 50,
               size.name = 1.1, size.title = rel(1),
               legend.col = c("#EDE342", "#F69A97", "#FF51EB"))

  invisible(dev.off())
}

# Same two panels on the active device, with smaller variable names
par(mfrow = c(1, 3))

# Supplemental Figure 3A
plotLoadings(diversity_train_splsda_final, comp = 1,
             contrib = "max", method = "mean", ndisplay = 50,
             size.name = 0.6, size.title = rel(1),
             legend = FALSE, legend.col = c("#EDE342", "#F69A97", "#FF51EB"))

# Supplemental Figure 3B
plotLoadings(diversity_train_splsda_final, comp = 2,
             contrib = "max", method = "mean", ndisplay = 50,
             size.name = 0.6, size.title = rel(1),
             legend.col = c("#EDE342", "#F69A97", "#FF51EB"))

#### Metaphlan + Qual Metabolites ####
# Long table with one row per sample and compound for the samples that have
# taxonomy data: metabolite value, cohort (db) and Shannon diversity, sorted
# by cohort and then by Shannon diversity.
heatmap_data_raw <- t_metaphlan %>%
  drop_na(taxid) %>%
  select(sampleID) %>%
  group_by(sampleID) %>%
  dplyr::slice(1) %>%
  # NOTE(review): this db value is dropped again below and replaced by the one
  # from metaphlan_df_sumry; it only matters if metab_qual_anon also has a db
  # column (it would then take part in the implicit join) - confirm.
  mutate(db = ifelse(grepl(sampleID, pattern = "lt"), "Liver Transplant", "Healthy Donor")) %>%
  left_join(metab_qual_anon) %>%
  mutate(
    compound = ifelse(compound == "isovaleric-acid", "isovalerate", compound),
    compound = str_to_title(compound),
    compound = recode(compound, Preq1 = "PreQ1")
  ) %>%
  filter(compound %in% heatmap_cmpds$compound | is.na(compound)) %>%
  # Keep the single largest value per sample and compound
  group_by(sampleID, compound) %>%
  slice_max(mvalue, with_ties = FALSE, n = 1) %>%
  ungroup() %>%
  select(-db) %>%
  left_join(
    metaphlan_df2 %>%
      left_join(metaphlan_df_sumry %>% select(sampleID, db)) %>%
      distinct(sampleID, db),
    by = "sampleID"
  ) %>%
  # The join can repeat rows when a sample has more than one db value
  group_by(sampleID, compound) %>%
  slice_max(mvalue, with_ties = FALSE, n = 1) %>%
  left_join(alpha_shannon) %>%
  # arrange() ignores grouping, so no regrouping by db is needed for the sort
  arrange(db, Shannon) %>%
  ungroup()

# Names of the compounds selected on component 2 of the diversity sPLS-DA model
diversity_train_splsda_final_comp2 <- mixOmics::selectVar(diversity_train_splsda_final, comp = 2)$name

# Bar chart of the component-2 compounds: one facet row per compound, one facet
# column per diversity group, samples ordered by Shannon diversity.
gg_diversity_comp2 <- metaphlan_df2 %>%
  left_join(metaphlan_df_sumry %>% select(sampleID, diversity_group)) %>%
  select(sampleID, diversity_group) %>%
  distinct(sampleID, diversity_group) %>%
  left_join(heatmap_data_raw) %>%
  filter(compound %in% diversity_train_splsda_final_comp2 | is.na(compound)) %>%
  mutate(diversity_group = factor(diversity_group, levels = c("Low Diversity", "Medium Diversity",
                                                              "High Diversity", "Healthy Donor"))) %>%
  # Widen and lengthen again so every sample has a row for every compound
  # (missing values stay NA), just for plotting
  pivot_wider(id_cols = c(sampleID, diversity_group, patientID, db, Shannon),
              names_from = "compound", values_from = "mvalue", values_fill = NA) %>%
  pivot_longer(-c(sampleID, diversity_group, patientID, db, Shannon),
               names_to = "compound", values_to = "mvalue") %>%
  # Anything that is not a selected compound (including the "NA" column created
  # by samples without metabolite data) becomes a missing factor value here
  mutate(compound = factor(compound, levels = diversity_train_splsda_final_comp2)) %>%
  # filter(compound == "Omega-Muricholic Acid") %>% arrange(desc(mvalue))
  # Drop those rows explicitly rather than by comparing against the string "NA"
  filter(!is.na(compound)) %>%
  group_by(sampleID, compound) %>%
  dplyr::slice(1) %>%
  ggplot(aes(x = reorder(sampleID, Shannon), y = mvalue, fill = diversity_group)) +
  geom_col() +
  scale_fill_manual(values = diversity_group_colors) +
  theme_bw() +
  theme(legend.position = "none",
        axis.text.x = eb(),
        axis.ticks.x = eb(),
        # strip.text.x= et(angle = 0, size = 14, color = "black"),
        strip.text.x = eb(),
        strip.background = eb(),
        strip.text.y = et(angle = 0, size = 14, color = "black", hjust = 0),
        axis.title.y = et(color = "black", size = 14),
        axis.text.y = et(color = "black", size = 12),
        panel.spacing = unit(0.5, "lines"),
        plot.margin = margin(t = 5,
                             r = 5,
                             b = 0,
                             l = 5),
        panel.grid.minor = eb()) +
  facet_grid(compound ~ diversity_group, scales = "free") +
  scale_y_continuous(expand = expansion(mult = c(0.005, 0.005))) +
  ylab("Normalized Peak Area") +
  xlab("")

  

# Write the taxonomy plot stacked above the component-2 compound bars to a PDF
# (height ratio 1:4).
# NOTE(review): the file name is spelled "Metahphlan"; left unchanged because
# other steps may refer to this exact path.
pdf(file = "./Results/Metahphlan_Medium_Diversity_Compounds.pdf", height = 16, width = 20, onefile = FALSE)
gg.stack(gg_metaphlan,
         gg_diversity_comp2,
         heights = c(1, 4))
# Close the PDF device (prints the device that becomes active afterwards)
dev.off()
## quartz_off_screen 
##                 2

Supplemental Figure 7A+B: Plot Loadings for Enterococcus Expansion

# Plot absolute loadings so every bar is non-negative, which makes the
# Enterococcus expansion loadings plots narrower. This overwrites the loadings
# stored in the model object itself.
ecoc_doms_train_splsda_final$loadings$X <- abs(ecoc_doms_train_splsda_final$loadings$X)

# Supplemental Figure 7AB: components 1 and 2 side by side in one PDF page

{
    pdf(file = "./Results/Supplemental_Figure_7AB.pdf", height = 8, width = 11)

    # Three slots per row; only the first two are filled
    par(mfrow = c(1, 3))

    # Component 1, legend suppressed
    plotLoadings(
        ecoc_doms_train_splsda_final,
        comp = 1,
        contrib = "max",
        method = "mean",
        ndisplay = 50,
        size.name = 1.1,
        size.title = rel(1),
        legend = FALSE,
        legend.col = c("#0C7A3A", "black")
    )

    # Component 2, default legend
    plotLoadings(
        ecoc_doms_train_splsda_final,
        comp = 2,
        contrib = "max",
        method = "mean",
        ndisplay = 50,
        size.name = 1.1,
        size.title = rel(1),
        legend.col = c("#0C7A3A", "black")
    )

    invisible(dev.off())
}

# Same two panels on the active device, with smaller variable names
par(mfrow = c(1, 3))


plotLoadings(
    ecoc_doms_train_splsda_final,
    comp = 1,
    contrib = "max",
    method = "mean",
    ndisplay = 50,
    size.name = 0.6,
    size.title = rel(1),
    legend = FALSE,
    legend.col = c("#0C7A3A", "black")
)

plotLoadings(
    ecoc_doms_train_splsda_final,
    comp = 2,
    contrib = "max",
    method = "mean",
    ndisplay = 50,
    size.name = 0.6,
    size.title = rel(1),
    legend.col = c("#0C7A3A", "black")
)

Supplemental Figure 7C+D: Plot Loadings for Enterobacterales Expansion

# Plot absolute loadings so every bar is non-negative, which makes the
# Enterobacterales expansion loadings plots narrower. This overwrites the
# loadings stored in the model object itself.
ebac_doms_train_splsda_final$loadings$X <- abs(ebac_doms_train_splsda_final$loadings$X)

# Supplemental Figure 7CD: components 1 and 2 side by side in one PDF page

{
    pdf(file = "./Results/Supplemental_Figure_7CD.pdf", height = 8, width = 11)

    # Three slots per row; only the first two are filled
    par(mfrow = c(1, 3))

    # Component 1, legend suppressed
    plotLoadings(
        ebac_doms_train_splsda_final,
        comp = 1,
        contrib = "max",
        method = "mean",
        ndisplay = 50,
        size.name = 1.1,
        size.title = rel(1),
        legend = FALSE,
        legend.col = c("#FF0000", "black")
    )

    # Component 2, default legend
    plotLoadings(
        ebac_doms_train_splsda_final,
        comp = 2,
        contrib = "max",
        method = "mean",
        ndisplay = 50,
        size.name = 1.1,
        size.title = rel(1),
        legend.col = c("#FF0000", "black")
    )

    invisible(dev.off())
}

# Same two panels on the active device, with smaller variable names
par(mfrow = c(1, 3))


plotLoadings(
    ebac_doms_train_splsda_final,
    comp = 1,
    contrib = "max",
    method = "mean",
    ndisplay = 50,
    size.name = 0.6,
    size.title = rel(1),
    legend = FALSE,
    legend.col = c("#FF0000", "black")
)

plotLoadings(
    ebac_doms_train_splsda_final,
    comp = 2,
    contrib = "max",
    method = "mean",
    ndisplay = 50,
    size.name = 0.6,
    size.title = rel(1),
    legend.col = c("#FF0000", "black")
)

Supplemental Figure 8A+B: Plot Loadings for Any Bacterial Infection

# Plot absolute loadings so every bar is non-negative, which makes the
# bacterial-infection loadings plots narrower. This overwrites the loadings
# stored in the model object itself.
infx_train_splsda_final$loadings$X <- abs(infx_train_splsda_final$loadings$X)

{
    pdf(file = "./Results/Supplemental_Figure_8.pdf", height = 8, width = 11)

    # Three slots per row; only the first two are filled
    par(mfrow = c(1, 3))

    # Supplemental Figure 8A: component 1, legend suppressed
    plotLoadings(
        infx_train_splsda_final,
        comp = 1,
        contrib = "max",
        method = "mean",
        ndisplay = 50,
        size.name = 1.1,
        size.title = rel(1),
        legend = FALSE,
        legend.col = c("goldenrod", "gray75")
    )

    # Supplemental Figure 8B: component 2, default legend
    plotLoadings(
        infx_train_splsda_final,
        comp = 2,
        contrib = "max",
        method = "mean",
        ndisplay = 50,
        size.name = 1.1,
        size.title = rel(1),
        legend.col = c("goldenrod", "gray75")
    )

    invisible(dev.off())
}

# Same two panels on the active device, with smaller variable names
par(mfrow = c(1, 3))

# Supplemental Figure 8A
plotLoadings(
    infx_train_splsda_final,
    comp = 1,
    contrib = "max",
    method = "mean",
    ndisplay = 50,
    size.name = 0.6,
    size.title = rel(1),
    legend = FALSE,
    legend.col = c("goldenrod", "gray75")
)

# Supplemental Figure 8B
plotLoadings(
    infx_train_splsda_final,
    comp = 2,
    contrib = "max",
    method = "mean",
    ndisplay = 50,
    size.name = 0.6,
    size.title = rel(1),
    legend.col = c("goldenrod", "gray75")
)

Tables

Supplemental Figure 5A: Flow Diagram

# Patient flow diagram: the first count is the total enrolled, each following
# count is the number remaining after the matching exclusion step.
flow_chart <- flow_exclusions(
    incl_counts = c(158, 130, 107, 25),
    total_label = "Total Patients Enrolled",
    incl_labels = c(
        "Patients w/ Transplant",
        "Patients Included",
        "Patients w/ Bacterial Infection"
    ),
    excl_labels = c(
        "No Transplant",
        "No Sample In Study Period\nDay -7 to +30",
        "Patients w/o Bacterial Infection"
    ),
    show_count = TRUE
)

# Show the diagram in the report
flow_chart
# Export: diagram -> SVG text -> raw bytes -> PDF file
rsvg_pdf(
    charToRaw(export_svg(flow_chart)),
    "./Results/Supplemental_Figure_5A.pdf"
)

Demographics

## Vector of variables to summarize
demo_vars <- c(
    "race",
    "sex",
    "age",
    "meld_transplant",
    "Alcoholic Hepatitis",
    "Alcoholic Cirrhosis",
    "NAFLD/NASH",
    "Primary Sclerosing Cholangitis",
    "Acute Viral Hepatitis",
    "Chronic Hepatitis B",
    "Chronic Hepatitis C",
    "Autoimmune",
    "Wilson's Disease",
    "Alpha-1 Antitrypsin",
    "Hemachromatosis",
    "Drug Induced Liver Injury or Toxin",
    "Budd Chiari",
    "Cryptogenic",
    "Malignancy",
    "Other",
    "Dialysis",
    "Pressers",
    "Mechanical Ventilation"
)
## Vector of categorical variables that need transformation:
## every summarized variable except the two continuous ones
demo_cats <- setdiff(demo_vars, c("age", "meld_transplant"))


# Demographics table stratified by infection status; missing values are kept
# as their own level for the categorical variables.
tab1_1 <- CreateTableOne(
    data = demo,
    strata = "any_infection",
    vars = demo_vars,
    factorVars = demo_cats,
    includeNA = TRUE,
    testNonNormal = "wilcox.test"
)

# Age is potentially skewed, need to state that it is skewed and re-run `CreateTableOne`
summary(tab1_1)
## 
##      ### Summary of continuous variables ###
## 
## any_infection: 0
##                  n miss p.miss mean sd median p25 p75 min max  skew kurt
## age             82    0      0   51 17     55  40  63   2  77 -1.00  0.8
## meld_transplant 82    0      0   26 10     28  19  33   6  49 -0.04 -0.7
## ------------------------------------------------------------ 
## any_infection: 1
##                  n miss p.miss mean sd median p25 p75 min max skew   kurt
## age             25    0      0   56 15     59  46  68  22  73 -0.9  0.002
## meld_transplant 25    0      0   27 10     30  19  33  11  42 -0.3 -1.057
## 
## p-values
##                   pNormal pNonNormal
## age             0.2072738  0.1765252
## meld_transplant 0.6624494  0.6085202
## 
## Standardize mean differences
##                    1 vs 2
## age             0.2976093
## meld_transplant 0.1025141
## 
## =======================================================================================
## 
##      ### Summary of categorical variables ### 
## 
## any_infection: 0
##                                 var  n miss p.miss
##                                race 82    0    0.0
##                                                   
##                                                   
##                                                   
##                                                   
##                                                   
##                                                   
##                                                   
##                                 sex 82    0    0.0
##                                                   
##                                                   
##                 Alcoholic Hepatitis 82    0    0.0
##                                                   
##                                                   
##                 Alcoholic Cirrhosis 82    0    0.0
##                                                   
##                                                   
##                          NAFLD/NASH 82    0    0.0
##                                                   
##                                                   
##      Primary Sclerosing Cholangitis 82    0    0.0
##                                                   
##                                                   
##               Acute Viral Hepatitis 82    0    0.0
##                                                   
##                                                   
##                 Chronic Hepatitis B 82    0    0.0
##                                                   
##                                                   
##                 Chronic Hepatitis C 82    0    0.0
##                                                   
##                                                   
##                          Autoimmune 82    0    0.0
##                                                   
##                                                   
##                    Wilson's Disease 82    0    0.0
##                                                   
##                                                   
##                 Alpha-1 Antitrypsin 82    0    0.0
##                                                   
##                     Hemachromatosis 82    0    0.0
##                                                   
##                                                   
##  Drug Induced Liver Injury or Toxin 82    0    0.0
##                                                   
##                                                   
##                         Budd Chiari 82    0    0.0
##                                                   
##                         Cryptogenic 82    0    0.0
##                                                   
##                                                   
##                          Malignancy 82    0    0.0
##                                                   
##                                                   
##                               Other 82    0    0.0
##                                                   
##                                                   
##                            Dialysis 82    0    0.0
##                                                   
##                                                   
##                            Pressers 82    0    0.0
##                                                   
##                                                   
##              Mechanical Ventilation 82    0    0.0
##                                                   
##                                                   
##                             level freq percent cum.percent
##  American Indian or Alaska Native    1     1.2         1.2
##              Asian/Mideast Indian    6     7.3         8.5
##            Black/African-American    9    11.0        19.5
##                More than one Race    8     9.8        29.3
##                  Patient Declined    4     4.9        34.1
##                           Unknown    0     0.0        34.1
##                             White   54    65.9       100.0
##                                                           
##                            Female   38    46.3        46.3
##                              Male   44    53.7       100.0
##                                                           
##                                 0   76    92.7        92.7
##                                 1    6     7.3       100.0
##                                                           
##                                 0   42    51.2        51.2
##                                 1   40    48.8       100.0
##                                                           
##                                 0   72    87.8        87.8
##                                 1   10    12.2       100.0
##                                                           
##                                 0   76    92.7        92.7
##                                 1    6     7.3       100.0
##                                                           
##                                 0   78    95.1        95.1
##                                 1    4     4.9       100.0
##                                                           
##                                 0   82   100.0       100.0
##                                 1    0     0.0       100.0
##                                                           
##                                 0   78    95.1        95.1
##                                 1    4     4.9       100.0
##                                                           
##                                 0   77    93.9        93.9
##                                 1    5     6.1       100.0
##                                                           
##                                 0   80    97.6        97.6
##                                 1    2     2.4       100.0
##                                                           
##                                 0   82   100.0       100.0
##                                                           
##                                 0   82   100.0       100.0
##                                 1    0     0.0       100.0
##                                                           
##                                 0   81    98.8        98.8
##                                 1    1     1.2       100.0
##                                                           
##                                 0   82   100.0       100.0
##                                                           
##                                 0   78    95.1        95.1
##                                 1    4     4.9       100.0
##                                                           
##                                 0   65    79.3        79.3
##                                 1   17    20.7       100.0
##                                                           
##                                 0   74    90.2        90.2
##                                 1    8     9.8       100.0
##                                                           
##                                 0   60    73.2        73.2
##                                 1   22    26.8       100.0
##                                                           
##                                 0   76    92.7        92.7
##                                 1    6     7.3       100.0
##                                                           
##                                 0   77    93.9        93.9
##                                 1    5     6.1       100.0
##                                                           
## ------------------------------------------------------------ 
## any_infection: 1
##                                 var  n miss p.miss
##                                race 25    0    0.0
##                                                   
##                                                   
##                                                   
##                                                   
##                                                   
##                                                   
##                                                   
##                                 sex 25    0    0.0
##                                                   
##                                                   
##                 Alcoholic Hepatitis 25    0    0.0
##                                                   
##                                                   
##                 Alcoholic Cirrhosis 25    0    0.0
##                                                   
##                                                   
##                          NAFLD/NASH 25    0    0.0
##                                                   
##                                                   
##      Primary Sclerosing Cholangitis 25    0    0.0
##                                                   
##                                                   
##               Acute Viral Hepatitis 25    0    0.0
##                                                   
##                                                   
##                 Chronic Hepatitis B 25    0    0.0
##                                                   
##                                                   
##                 Chronic Hepatitis C 25    0    0.0
##                                                   
##                                                   
##                          Autoimmune 25    0    0.0
##                                                   
##                                                   
##                    Wilson's Disease 25    0    0.0
##                                                   
##                                                   
##                 Alpha-1 Antitrypsin 25    0    0.0
##                                                   
##                     Hemachromatosis 25    0    0.0
##                                                   
##                                                   
##  Drug Induced Liver Injury or Toxin 25    0    0.0
##                                                   
##                                                   
##                         Budd Chiari 25    0    0.0
##                                                   
##                         Cryptogenic 25    0    0.0
##                                                   
##                                                   
##                          Malignancy 25    0    0.0
##                                                   
##                                                   
##                               Other 25    0    0.0
##                                                   
##                                                   
##                            Dialysis 25    0    0.0
##                                                   
##                                                   
##                            Pressers 25    0    0.0
##                                                   
##                                                   
##              Mechanical Ventilation 25    0    0.0
##                                                   
##                                                   
##                             level freq percent cum.percent
##  American Indian or Alaska Native    0     0.0         0.0
##              Asian/Mideast Indian    2     8.0         8.0
##            Black/African-American    2     8.0        16.0
##                More than one Race    2     8.0        24.0
##                  Patient Declined    1     4.0        28.0
##                           Unknown    2     8.0        36.0
##                             White   16    64.0       100.0
##                                                           
##                            Female    9    36.0        36.0
##                              Male   16    64.0       100.0
##                                                           
##                                 0   23    92.0        92.0
##                                 1    2     8.0       100.0
##                                                           
##                                 0   17    68.0        68.0
##                                 1    8    32.0       100.0
##                                                           
##                                 0   19    76.0        76.0
##                                 1    6    24.0       100.0
##                                                           
##                                 0   25   100.0       100.0
##                                 1    0     0.0       100.0
##                                                           
##                                 0   25   100.0       100.0
##                                 1    0     0.0       100.0
##                                                           
##                                 0   24    96.0        96.0
##                                 1    1     4.0       100.0
##                                                           
##                                 0   25   100.0       100.0
##                                 1    0     0.0       100.0
##                                                           
##                                 0   25   100.0       100.0
##                                 1    0     0.0       100.0
##                                                           
##                                 0   24    96.0        96.0
##                                 1    1     4.0       100.0
##                                                           
##                                 0   25   100.0       100.0
##                                                           
##                                 0   24    96.0        96.0
##                                 1    1     4.0       100.0
##                                                           
##                                 0   25   100.0       100.0
##                                 1    0     0.0       100.0
##                                                           
##                                 0   25   100.0       100.0
##                                                           
##                                 0   24    96.0        96.0
##                                 1    1     4.0       100.0
##                                                           
##                                 0   19    76.0        76.0
##                                 1    6    24.0       100.0
##                                                           
##                                 0   20    80.0        80.0
##                                 1    5    20.0       100.0
##                                                           
##                                 0   16    64.0        64.0
##                                 1    9    36.0       100.0
##                                                           
##                                 0   20    80.0        80.0
##                                 1    5    20.0       100.0
##                                                           
##                                 0   23    92.0        92.0
##                                 1    2     8.0       100.0
##                                                           
## 
## p-values
##                                      pApprox    pExact
## race                               0.3074905 0.4275466
## sex                                0.4953032 0.4904875
## Alcoholic Hepatitis                1.0000000 1.0000000
## Alcoholic Cirrhosis                0.2123469 0.1713637
## NAFLD/NASH                         0.2590605 0.1979409
## Primary Sclerosing Cholangitis     0.3704754 0.3322151
## Acute Viral Hepatitis              0.6007080 0.5709809
## Chronic Hepatitis B                0.5271112 0.2336449
## Chronic Hepatitis C                0.6007080 0.5709809
## Autoimmune                         0.4694777 0.5886832
## Wilson's Disease                   1.0000000 0.5538202
## Alpha-1 Antitrypsin                       NA        NA
## Hemachromatosis                    0.5271112 0.2336449
## Drug Induced Liver Injury or Toxin 1.0000000 1.0000000
## Budd Chiari                               NA        NA
## Cryptogenic                        1.0000000 1.0000000
## Malignancy                         0.9440592 0.7828238
## Other                              0.3063991 0.1774710
## Dialysis                           0.5266900 0.4513768
## Pressers                           0.1465607 0.1238754
## Mechanical Ventilation             1.0000000 0.6642869
## 
## Standardize mean differences
##                                        1 vs 2
## race                               0.45891730
## sex                                0.21130091
## Alcoholic Hepatitis                0.02568258
## Alcoholic Cirrhosis                0.34709743
## NAFLD/NASH                         0.31029007
## Primary Sclerosing Cholangitis     0.39735971
## Acute Viral Hepatitis              0.32025631
## Chronic Hepatitis B                0.28867513
## Chronic Hepatitis C                0.32025631
## Autoimmune                         0.36037499
## Wilson's Disease                   0.08851811
## Alpha-1 Antitrypsin                0.00000000
## Hemachromatosis                    0.28867513
## Drug Induced Liver Injury or Toxin 0.15713484
## Budd Chiari                        0.00000000
## Cryptogenic                        0.04264158
## Malignancy                         0.07849392
## Other                              0.29088216
## Dialysis                           0.19854168
## Pressers                           0.37578690
## Mechanical Ventilation             0.07437488
# Continuous variables to report as median [IQR]; they are flagged
# `nonnorm` in the test column of the printed table
tableone_skewed <- c("age", "meld_transplant")

# Print the stratified table and keep the value `print()` returns so it
# can be exported afterwards
tab1_2 <- print(
    tab1_1,
    nonnormal = tableone_skewed,
    formatOptions = list(big.mark = ",")
)
##                                             Stratified by any_infection
##                                              0                   
##   n                                             82               
##   race (%)                                                       
##      American Indian or Alaska Native            1 (  1.2)       
##      Asian/Mideast Indian                        6 (  7.3)       
##      Black/African-American                      9 ( 11.0)       
##      More than one Race                          8 (  9.8)       
##      Patient Declined                            4 (  4.9)       
##      Unknown                                     0 (  0.0)       
##      White                                      54 ( 65.9)       
##   sex = Male (%)                                44 ( 53.7)       
##   age (median [IQR])                         55.00 [39.50, 63.00]
##   meld_transplant (median [IQR])             28.00 [19.00, 33.00]
##   Alcoholic Hepatitis = 1 (%)                    6 (  7.3)       
##   Alcoholic Cirrhosis = 1 (%)                   40 ( 48.8)       
##   NAFLD/NASH = 1 (%)                            10 ( 12.2)       
##   Primary Sclerosing Cholangitis = 1 (%)         6 (  7.3)       
##   Acute Viral Hepatitis = 1 (%)                  4 (  4.9)       
##   Chronic Hepatitis B = 1 (%)                    0 (  0.0)       
##   Chronic Hepatitis C = 1 (%)                    4 (  4.9)       
##   Autoimmune = 1 (%)                             5 (  6.1)       
##   Wilson's Disease = 1 (%)                       2 (  2.4)       
##   Alpha-1 Antitrypsin = 0 (%)                   82 (100.0)       
##   Hemachromatosis = 1 (%)                        0 (  0.0)       
##   Drug Induced Liver Injury or Toxin = 1 (%)     1 (  1.2)       
##   Budd Chiari = 0 (%)                           82 (100.0)       
##   Cryptogenic = 1 (%)                            4 (  4.9)       
##   Malignancy = 1 (%)                            17 ( 20.7)       
##   Other = 1 (%)                                  8 (  9.8)       
##   Dialysis = 1 (%)                              22 ( 26.8)       
##   Pressers = 1 (%)                               6 (  7.3)       
##   Mechanical Ventilation = 1 (%)                 5 (  6.1)       
##                                             Stratified by any_infection
##                                              1                    p     
##   n                                             25                      
##   race (%)                                                         0.307
##      American Indian or Alaska Native            0 (  0.0)              
##      Asian/Mideast Indian                        2 (  8.0)              
##      Black/African-American                      2 (  8.0)              
##      More than one Race                          2 (  8.0)              
##      Patient Declined                            1 (  4.0)              
##      Unknown                                     2 (  8.0)              
##      White                                      16 ( 64.0)              
##   sex = Male (%)                                16 ( 64.0)         0.495
##   age (median [IQR])                         59.00 [46.00, 68.00]  0.177
##   meld_transplant (median [IQR])             30.00 [19.00, 33.00]  0.609
##   Alcoholic Hepatitis = 1 (%)                    2 (  8.0)         1.000
##   Alcoholic Cirrhosis = 1 (%)                    8 ( 32.0)         0.212
##   NAFLD/NASH = 1 (%)                             6 ( 24.0)         0.259
##   Primary Sclerosing Cholangitis = 1 (%)         0 (  0.0)         0.370
##   Acute Viral Hepatitis = 1 (%)                  0 (  0.0)         0.601
##   Chronic Hepatitis B = 1 (%)                    1 (  4.0)         0.527
##   Chronic Hepatitis C = 1 (%)                    0 (  0.0)         0.601
##   Autoimmune = 1 (%)                             0 (  0.0)         0.469
##   Wilson's Disease = 1 (%)                       1 (  4.0)         1.000
##   Alpha-1 Antitrypsin = 0 (%)                   25 (100.0)            NA
##   Hemachromatosis = 1 (%)                        1 (  4.0)         0.527
##   Drug Induced Liver Injury or Toxin = 1 (%)     0 (  0.0)         1.000
##   Budd Chiari = 0 (%)                           25 (100.0)            NA
##   Cryptogenic = 1 (%)                            1 (  4.0)         1.000
##   Malignancy = 1 (%)                             6 ( 24.0)         0.944
##   Other = 1 (%)                                  5 ( 20.0)         0.306
##   Dialysis = 1 (%)                               9 ( 36.0)         0.527
##   Pressers = 1 (%)                               5 ( 20.0)         0.147
##   Mechanical Ventilation = 1 (%)                 2 (  8.0)         1.000
##                                             Stratified by any_infection
##                                              test   
##   n                                                 
##   race (%)                                          
##      American Indian or Alaska Native               
##      Asian/Mideast Indian                           
##      Black/African-American                         
##      More than one Race                             
##      Patient Declined                               
##      Unknown                                        
##      White                                          
##   sex = Male (%)                                    
##   age (median [IQR])                         nonnorm
##   meld_transplant (median [IQR])             nonnorm
##   Alcoholic Hepatitis = 1 (%)                       
##   Alcoholic Cirrhosis = 1 (%)                       
##   NAFLD/NASH = 1 (%)                                
##   Primary Sclerosing Cholangitis = 1 (%)            
##   Acute Viral Hepatitis = 1 (%)                     
##   Chronic Hepatitis B = 1 (%)                       
##   Chronic Hepatitis C = 1 (%)                       
##   Autoimmune = 1 (%)                                
##   Wilson's Disease = 1 (%)                          
##   Alpha-1 Antitrypsin = 0 (%)                       
##   Hemachromatosis = 1 (%)                           
##   Drug Induced Liver Injury or Toxin = 1 (%)        
##   Budd Chiari = 0 (%)                               
##   Cryptogenic = 1 (%)                               
##   Malignancy = 1 (%)                                
##   Other = 1 (%)                                     
##   Dialysis = 1 (%)                                  
##   Pressers = 1 (%)                                  
##   Mechanical Ventilation = 1 (%)
# Export the printed table, then read it straight back in: the round trip
# through CSV is an easy way to get the table as a data frame whose
# p-values can be adjusted.
write.csv(tab1_2, file = "./Results/Demo_Table_1.csv", row.names = TRUE)

# Re-imported table with display-friendly column names (the row labels
# arrive in column `X`, the two strata columns in `X0` and `X1`).
tab1_2_padjust1 <- dplyr::rename(
    read.csv("./Results/Demo_Table_1.csv"),
    ` ` = X,
    `No Infection` = X0,
    `Bacterial Infection` = X1
)

# Same table with the row labels stored as a factor whose levels follow
# the current row order; used below to sort rows back into that order.
tab1_2_padjust2 <- mutate(
    tab1_2_padjust1,
    ` ` = factor(` `, levels = tab1_2_padjust1$` `)
)

# Benjamini-Hochberg adjustment of the p-values, with rows grouped by
# test type beforehand.
tab1_2_padjust3 <- tab1_2_padjust1 %>%
    # Rows that have a p-value but a blank test label are labelled "chi.sq"
    mutate(test = ifelse(!is.na(p) & test == "", "chi.sq", test)) %>%
    group_by(test) %>%
    rstatix::adjust_pvalue(p.col = "p", method = "BH") %>%
    ungroup() %>%
    # Sort the rows back into the order of the original table
    mutate(` ` = factor(` `, levels = tab1_2_padjust2$` `)) %>%
    arrange(` `) %>%
    # Show missing p-values as empty cells rather than NA
    mutate(p = ifelse(is.na(p), "", p)) %>%
    mutate(p.adj = ifelse(is.na(p.adj), "", p.adj))

# Save the table with the adjusted p-values appended
write.csv(
    tab1_2_padjust3,
    file = "./Results/Demo_Table_1_padjust.csv",
    row.names = FALSE
)

#### Stratify by diversity ####
# Attach each patient's diversity group to the demographics table. The
# patient ID is extracted from the sample ID (pattern "lt-<digits>"), and
# the lookup is reduced to distinct patientID / diversity_group pairs.
# The join key is stated explicitly so the join does not depend on which
# column names the two tables happen to share (and so dplyr does not have
# to guess and print a "Joining with ..." message).
# NOTE(review): assumes `demo` has no `diversity_group` column of its own
# and that each patient maps to a single diversity group - confirm.
demo_div <- demo %>%
    left_join(
        metaphlan_df_sumry %>%
            mutate(patientID = str_extract(string = sampleID, pattern = "lt-[0-9]+")) %>%
            distinct(patientID, diversity_group) %>%
            drop_na(patientID) %>%
            droplevels(),
        by = "patientID"
    )

# Demographics table stratified by diversity group; missing values are
# counted as their own level for the categorical variables.
div_tab1_1 <- CreateTableOne(vars = demo_vars, testNonNormal = "kruskal.test",
    includeNA = TRUE, factorVars = demo_cats, strata = "diversity_group",
    data = demo_div)

# Inspect skew/kurtosis of the continuous variables. Age looks potentially
# skewed; skewed variables are declared through the `nonnormal` argument
# when the table is printed (as done for `tab1_1` above).
summary(div_tab1_1)
## 
##      ### Summary of continuous variables ###
## 
## diversity_group: Low Diversity
##                  n miss p.miss mean sd median p25 p75 min max skew kurt
## age             40    0      0   50 16     54  38  62   2  77 -0.8  0.8
## meld_transplant 40    0      0   31  8     31  29  36   7  44 -0.9  1.1
## ------------------------------------------------------------ 
## diversity_group: Medium Diversity
##                  n miss p.miss mean sd median p25 p75 min max skew kurt
## age             40    0      0   54 16     58  47  65   2  72 -1.5  2.2
## meld_transplant 40    0      0   27 10     24  20  33   9  49  0.3 -0.3
## ------------------------------------------------------------ 
## diversity_group: High Diversity
##                  n miss p.miss mean sd median p25 p75 min max skew kurt
## age             27    0      0   52 17     60  39  68  12  73 -0.7 -0.4
## meld_transplant 27    0      0   20 11     18  12  28   6  46  0.8 -0.1
## 
## p-values
##                      pNormal   pNonNormal
## age             5.654067e-01 0.3575030890
## meld_transplant 6.927777e-05 0.0001636809
## 
## Standardize mean differences
##                   average    1 vs 2    1 vs 3     2 vs 3
## age             0.1577328 0.2387530 0.1575451 0.07690025
## meld_transplant 0.7501053 0.4792525 1.1430397 0.62802360
## 
## =======================================================================================
## 
##      ### Summary of categorical variables ### 
## 
## diversity_group: Low Diversity
##                                 var  n miss p.miss
##                                race 40    0    0.0
##                                                   
##                                                   
##                                                   
##                                                   
##                                                   
##                                                   
##                                                   
##                                 sex 40    0    0.0
##                                                   
##                                                   
##                 Alcoholic Hepatitis 40    0    0.0
##                                                   
##                                                   
##                 Alcoholic Cirrhosis 40    0    0.0
##                                                   
##                                                   
##                          NAFLD/NASH 40    0    0.0
##                                                   
##                                                   
##      Primary Sclerosing Cholangitis 40    0    0.0
##                                                   
##                                                   
##               Acute Viral Hepatitis 40    0    0.0
##                                                   
##                                                   
##                 Chronic Hepatitis B 40    0    0.0
##                                                   
##                                                   
##                 Chronic Hepatitis C 40    0    0.0
##                                                   
##                                                   
##                          Autoimmune 40    0    0.0
##                                                   
##                                                   
##                    Wilson's Disease 40    0    0.0
##                                                   
##                                                   
##                 Alpha-1 Antitrypsin 40    0    0.0
##                                                   
##                     Hemachromatosis 40    0    0.0
##                                                   
##                                                   
##  Drug Induced Liver Injury or Toxin 40    0    0.0
##                                                   
##                                                   
##                         Budd Chiari 40    0    0.0
##                                                   
##                         Cryptogenic 40    0    0.0
##                                                   
##                                                   
##                          Malignancy 40    0    0.0
##                                                   
##                                                   
##                               Other 40    0    0.0
##                                                   
##                                                   
##                            Dialysis 40    0    0.0
##                                                   
##                                                   
##                            Pressers 40    0    0.0
##                                                   
##                                                   
##              Mechanical Ventilation 40    0    0.0
##                                                   
##                                                   
##                             level freq percent cum.percent
##  American Indian or Alaska Native    0     0.0         0.0
##              Asian/Mideast Indian    3     7.5         7.5
##            Black/African-American    4    10.0        17.5
##                More than one Race    2     5.0        22.5
##                  Patient Declined    1     2.5        25.0
##                           Unknown    0     0.0        25.0
##                             White   30    75.0       100.0
##                                                           
##                            Female   20    50.0        50.0
##                              Male   20    50.0       100.0
##                                                           
##                                 0   35    87.5        87.5
##                                 1    5    12.5       100.0
##                                                           
##                                 0   20    50.0        50.0
##                                 1   20    50.0       100.0
##                                                           
##                                 0   35    87.5        87.5
##                                 1    5    12.5       100.0
##                                                           
##                                 0   39    97.5        97.5
##                                 1    1     2.5       100.0
##                                                           
##                                 0   38    95.0        95.0
##                                 1    2     5.0       100.0
##                                                           
##                                 0   39    97.5        97.5
##                                 1    1     2.5       100.0
##                                                           
##                                 0   40   100.0       100.0
##                                 1    0     0.0       100.0
##                                                           
##                                 0   37    92.5        92.5
##                                 1    3     7.5       100.0
##                                                           
##                                 0   38    95.0        95.0
##                                 1    2     5.0       100.0
##                                                           
##                                 0   40   100.0       100.0
##                                                           
##                                 0   39    97.5        97.5
##                                 1    1     2.5       100.0
##                                                           
##                                 0   39    97.5        97.5
##                                 1    1     2.5       100.0
##                                                           
##                                 0   40   100.0       100.0
##                                                           
##                                 0   39    97.5        97.5
##                                 1    1     2.5       100.0
##                                                           
##                                 0   37    92.5        92.5
##                                 1    3     7.5       100.0
##                                                           
##                                 0   35    87.5        87.5
##                                 1    5    12.5       100.0
##                                                           
##                                 0   23    57.5        57.5
##                                 1   17    42.5       100.0
##                                                           
##                                 0   32    80.0        80.0
##                                 1    8    20.0       100.0
##                                                           
##                                 0   35    87.5        87.5
##                                 1    5    12.5       100.0
##                                                           
## ------------------------------------------------------------ 
## diversity_group: Medium Diversity
##                                 var  n miss p.miss
##                                race 40    0    0.0
##                                                   
##                                                   
##                                                   
##                                                   
##                                                   
##                                                   
##                                                   
##                                 sex 40    0    0.0
##                                                   
##                                                   
##                 Alcoholic Hepatitis 40    0    0.0
##                                                   
##                                                   
##                 Alcoholic Cirrhosis 40    0    0.0
##                                                   
##                                                   
##                          NAFLD/NASH 40    0    0.0
##                                                   
##                                                   
##      Primary Sclerosing Cholangitis 40    0    0.0
##                                                   
##                                                   
##               Acute Viral Hepatitis 40    0    0.0
##                                                   
##                                                   
##                 Chronic Hepatitis B 40    0    0.0
##                                                   
##                                                   
##                 Chronic Hepatitis C 40    0    0.0
##                                                   
##                                                   
##                          Autoimmune 40    0    0.0
##                                                   
##                                                   
##                    Wilson's Disease 40    0    0.0
##                                                   
##                                                   
##                 Alpha-1 Antitrypsin 40    0    0.0
##                                                   
##                     Hemachromatosis 40    0    0.0
##                                                   
##                                                   
##  Drug Induced Liver Injury or Toxin 40    0    0.0
##                                                   
##                                                   
##                         Budd Chiari 40    0    0.0
##                                                   
##                         Cryptogenic 40    0    0.0
##                                                   
##                                                   
##                          Malignancy 40    0    0.0
##                                                   
##                                                   
##                               Other 40    0    0.0
##                                                   
##                                                   
##                            Dialysis 40    0    0.0
##                                                   
##                                                   
##                            Pressers 40    0    0.0
##                                                   
##                                                   
##              Mechanical Ventilation 40    0    0.0
##                                                   
##                                                   
##                             level freq percent cum.percent
##  American Indian or Alaska Native    1     2.5         2.5
##              Asian/Mideast Indian    2     5.0         7.5
##            Black/African-American    5    12.5        20.0
##                More than one Race    6    15.0        35.0
##                  Patient Declined    1     2.5        37.5
##                           Unknown    0     0.0        37.5
##                             White   25    62.5       100.0
##                                                           
##                            Female   18    45.0        45.0
##                              Male   22    55.0       100.0
##                                                           
##                                 0   39    97.5        97.5
##                                 1    1     2.5       100.0
##                                                           
##                                 0   17    42.5        42.5
##                                 1   23    57.5       100.0
##                                                           
##                                 0   34    85.0        85.0
##                                 1    6    15.0       100.0
##                                                           
##                                 0   38    95.0        95.0
##                                 1    2     5.0       100.0
##                                                           
##                                 0   39    97.5        97.5
##                                 1    1     2.5       100.0
##                                                           
##                                 0   40   100.0       100.0
##                                 1    0     0.0       100.0
##                                                           
##                                 0   40   100.0       100.0
##                                 1    0     0.0       100.0
##                                                           
##                                 0   40   100.0       100.0
##                                 1    0     0.0       100.0
##                                                           
##                                 0   40   100.0       100.0
##                                 1    0     0.0       100.0
##                                                           
##                                 0   40   100.0       100.0
##                                                           
##                                 0   40   100.0       100.0
##                                 1    0     0.0       100.0
##                                                           
##                                 0   40   100.0       100.0
##                                 1    0     0.0       100.0
##                                                           
##                                 0   40   100.0       100.0
##                                                           
##                                 0   36    90.0        90.0
##                                 1    4    10.0       100.0
##                                                           
##                                 0   32    80.0        80.0
##                                 1    8    20.0       100.0
##                                                           
##                                 0   37    92.5        92.5
##                                 1    3     7.5       100.0
##                                                           
##                                 0   29    72.5        72.5
##                                 1   11    27.5       100.0
##                                                           
##                                 0   38    95.0        95.0
##                                 1    2     5.0       100.0
##                                                           
##                                 0   39    97.5        97.5
##                                 1    1     2.5       100.0
##                                                           
## ------------------------------------------------------------ 
## diversity_group: High Diversity
##                                 var  n miss p.miss
##                                race 27    0    0.0
##                                                   
##                                                   
##                                                   
##                                                   
##                                                   
##                                                   
##                                                   
##                                 sex 27    0    0.0
##                                                   
##                                                   
##                 Alcoholic Hepatitis 27    0    0.0
##                                                   
##                                                   
##                 Alcoholic Cirrhosis 27    0    0.0
##                                                   
##                                                   
##                          NAFLD/NASH 27    0    0.0
##                                                   
##                                                   
##      Primary Sclerosing Cholangitis 27    0    0.0
##                                                   
##                                                   
##               Acute Viral Hepatitis 27    0    0.0
##                                                   
##                                                   
##                 Chronic Hepatitis B 27    0    0.0
##                                                   
##                                                   
##                 Chronic Hepatitis C 27    0    0.0
##                                                   
##                                                   
##                          Autoimmune 27    0    0.0
##                                                   
##                                                   
##                    Wilson's Disease 27    0    0.0
##                                                   
##                                                   
##                 Alpha-1 Antitrypsin 27    0    0.0
##                                                   
##                     Hemachromatosis 27    0    0.0
##                                                   
##                                                   
##  Drug Induced Liver Injury or Toxin 27    0    0.0
##                                                   
##                                                   
##                         Budd Chiari 27    0    0.0
##                                                   
##                         Cryptogenic 27    0    0.0
##                                                   
##                                                   
##                          Malignancy 27    0    0.0
##                                                   
##                                                   
##                               Other 27    0    0.0
##                                                   
##                                                   
##                            Dialysis 27    0    0.0
##                                                   
##                                                   
##                            Pressers 27    0    0.0
##                                                   
##                                                   
##              Mechanical Ventilation 27    0    0.0
##                                                   
##                                                   
##                             level freq percent cum.percent
##  American Indian or Alaska Native    0     0.0         0.0
##              Asian/Mideast Indian    3    11.1        11.1
##            Black/African-American    2     7.4        18.5
##                More than one Race    2     7.4        25.9
##                  Patient Declined    3    11.1        37.0
##                           Unknown    2     7.4        44.4
##                             White   15    55.6       100.0
##                                                           
##                            Female    9    33.3        33.3
##                              Male   18    66.7       100.0
##                                                           
##                                 0   25    92.6        92.6
##                                 1    2     7.4       100.0
##                                                           
##                                 0   22    81.5        81.5
##                                 1    5    18.5       100.0
##                                                           
##                                 0   22    81.5        81.5
##                                 1    5    18.5       100.0
##                                                           
##                                 0   24    88.9        88.9
##                                 1    3    11.1       100.0
##                                                           
##                                 0   26    96.3        96.3
##                                 1    1     3.7       100.0
##                                                           
##                                 0   27   100.0       100.0
##                                 1    0     0.0       100.0
##                                                           
##                                 0   23    85.2        85.2
##                                 1    4    14.8       100.0
##                                                           
##                                 0   25    92.6        92.6
##                                 1    2     7.4       100.0
##                                                           
##                                 0   26    96.3        96.3
##                                 1    1     3.7       100.0
##                                                           
##                                 0   27   100.0       100.0
##                                                           
##                                 0   27   100.0       100.0
##                                 1    0     0.0       100.0
##                                                           
##                                 0   27   100.0       100.0
##                                 1    0     0.0       100.0
##                                                           
##                                 0   27   100.0       100.0
##                                                           
##                                 0   27   100.0       100.0
##                                 1    0     0.0       100.0
##                                                           
##                                 0   15    55.6        55.6
##                                 1   12    44.4       100.0
##                                                           
##                                 0   22    81.5        81.5
##                                 1    5    18.5       100.0
##                                                           
##                                 0   24    88.9        88.9
##                                 1    3    11.1       100.0
##                                                           
##                                 0   26    96.3        96.3
##                                 1    1     3.7       100.0
##                                                           
##                                 0   26    96.3        96.3
##                                 1    1     3.7       100.0
##                                                           
## 
## p-values
##                                        pApprox      pExact
## race                               0.224346115 0.335681971
## sex                                0.396960524 0.451283920
## Alcoholic Hepatitis                0.235579722 0.270062209
## Alcoholic Cirrhosis                0.005029855 0.003784742
## NAFLD/NASH                         0.794836269 0.744506152
## Primary Sclerosing Cholangitis     0.316203824 0.334821352
## Acute Viral Hepatitis              0.840515121 1.000000000
## Chronic Hepatitis B                0.429385165 1.000000000
## Chronic Hepatitis C                0.002120594 0.003400761
## Autoimmune                         0.208909105 0.180931478
## Wilson's Disease                   0.378684095 0.467969872
## Alpha-1 Antitrypsin                         NA          NA
## Hemachromatosis                    0.429385165 1.000000000
## Drug Induced Liver Injury or Toxin 0.429385165 1.000000000
## Budd Chiari                                 NA          NA
## Cryptogenic                        0.116732808 0.249704864
## Malignancy                         0.001414085 0.001812476
## Other                              0.398340379 0.366166291
## Dialysis                           0.020393763 0.018754199
## Pressers                           0.037397761 0.045589684
## Mechanical Ventilation             0.153586594 0.219265323
## 
## Standardize mean differences
##                                       average     1 vs 2      1 vs 3     2 vs 3
## race                               0.58746504 0.44450333 0.615537269 0.70235452
## sex                                0.22799474 0.10025094 0.342997170 0.24073611
## Alcoholic Hepatitis                0.26167960 0.38669460 0.170722265 0.22762195
## Alcoholic Cirrhosis                0.57695699 0.15085094 0.703164058 0.87685597
## NAFLD/NASH                         0.11126301 0.07264327 0.166838145 0.09430761
## Primary Sclerosing Cholangitis     0.23496287 0.13187609 0.347035461 0.22597705
## Acute Viral Hepatitis              0.08830602 0.13187609 0.063569415 0.06947254
## Chronic Hepatitis B                0.15097027 0.22645541 0.226455407 0.00000000
## Chronic Hepatitis C                0.39317855 0.00000000 0.589767825 0.58976782
## Autoimmune                         0.26873969 0.40269363 0.003525424 0.40000000
## Wilson's Disease                   0.22178745 0.32444284 0.063569415 0.27735010
## Alpha-1 Antitrypsin                0.00000000 0.00000000 0.000000000 0.00000000
## Hemachromatosis                    0.15097027 0.22645541 0.226455407 0.00000000
## Drug Induced Liver Injury or Toxin 0.15097027 0.22645541 0.226455407 0.00000000
## Budd Chiari                        0.00000000 0.00000000 0.000000000 0.00000000
## Cryptogenic                        0.33716165 0.31362502 0.226455407 0.47140452
## Malignancy                         0.61335070 0.36910674 0.929014264 0.54193110
## Other                              0.22203541 0.16724840 0.166838145 0.33201970
## Dialysis                           0.50024151 0.31844694 0.757800685 0.42447691
## Pressers                           0.35009029 0.46569032 0.521011140 0.06356941
## Mechanical Ventilation             0.26093600 0.38669460 0.326640860 0.06947254
# Continuous variables to treat as non-normal when the table is printed
# (the printed table reports them as median [IQR]).
tableone_skewed <- c("age", "meld_transplant")

# Print the diversity-stratified table and keep the formatted result that
# print() returns so it can be exported below.
div_tab1_2 <- print(
    div_tab1_1,
    nonnormal = tableone_skewed,
    formatOptions = list(big.mark = ",")
)
##                                             Stratified by diversity_group
##                                              Low Diversity       
##   n                                             40               
##   race (%)                                                       
##      American Indian or Alaska Native            0 (  0.0)       
##      Asian/Mideast Indian                        3 (  7.5)       
##      Black/African-American                      4 ( 10.0)       
##      More than one Race                          2 (  5.0)       
##      Patient Declined                            1 (  2.5)       
##      Unknown                                     0 (  0.0)       
##      White                                      30 ( 75.0)       
##   sex = Male (%)                                20 ( 50.0)       
##   age (median [IQR])                         54.00 [37.75, 62.00]
##   meld_transplant (median [IQR])             31.00 [29.00, 36.00]
##   Alcoholic Hepatitis = 1 (%)                    5 ( 12.5)       
##   Alcoholic Cirrhosis = 1 (%)                   20 ( 50.0)       
##   NAFLD/NASH = 1 (%)                             5 ( 12.5)       
##   Primary Sclerosing Cholangitis = 1 (%)         1 (  2.5)       
##   Acute Viral Hepatitis = 1 (%)                  2 (  5.0)       
##   Chronic Hepatitis B = 1 (%)                    1 (  2.5)       
##   Chronic Hepatitis C = 1 (%)                    0 (  0.0)       
##   Autoimmune = 1 (%)                             3 (  7.5)       
##   Wilson's Disease = 1 (%)                       2 (  5.0)       
##   Alpha-1 Antitrypsin = 0 (%)                   40 (100.0)       
##   Hemachromatosis = 1 (%)                        1 (  2.5)       
##   Drug Induced Liver Injury or Toxin = 1 (%)     1 (  2.5)       
##   Budd Chiari = 0 (%)                           40 (100.0)       
##   Cryptogenic = 1 (%)                            1 (  2.5)       
##   Malignancy = 1 (%)                             3 (  7.5)       
##   Other = 1 (%)                                  5 ( 12.5)       
##   Dialysis = 1 (%)                              17 ( 42.5)       
##   Pressers = 1 (%)                               8 ( 20.0)       
##   Mechanical Ventilation = 1 (%)                 5 ( 12.5)       
##                                             Stratified by diversity_group
##                                              Medium Diversity    
##   n                                             40               
##   race (%)                                                       
##      American Indian or Alaska Native            1 (  2.5)       
##      Asian/Mideast Indian                        2 (  5.0)       
##      Black/African-American                      5 ( 12.5)       
##      More than one Race                          6 ( 15.0)       
##      Patient Declined                            1 (  2.5)       
##      Unknown                                     0 (  0.0)       
##      White                                      25 ( 62.5)       
##   sex = Male (%)                                22 ( 55.0)       
##   age (median [IQR])                         58.00 [46.75, 65.00]
##   meld_transplant (median [IQR])             24.50 [19.75, 33.00]
##   Alcoholic Hepatitis = 1 (%)                    1 (  2.5)       
##   Alcoholic Cirrhosis = 1 (%)                   23 ( 57.5)       
##   NAFLD/NASH = 1 (%)                             6 ( 15.0)       
##   Primary Sclerosing Cholangitis = 1 (%)         2 (  5.0)       
##   Acute Viral Hepatitis = 1 (%)                  1 (  2.5)       
##   Chronic Hepatitis B = 1 (%)                    0 (  0.0)       
##   Chronic Hepatitis C = 1 (%)                    0 (  0.0)       
##   Autoimmune = 1 (%)                             0 (  0.0)       
##   Wilson's Disease = 1 (%)                       0 (  0.0)       
##   Alpha-1 Antitrypsin = 0 (%)                   40 (100.0)       
##   Hemachromatosis = 1 (%)                        0 (  0.0)       
##   Drug Induced Liver Injury or Toxin = 1 (%)     0 (  0.0)       
##   Budd Chiari = 0 (%)                           40 (100.0)       
##   Cryptogenic = 1 (%)                            4 ( 10.0)       
##   Malignancy = 1 (%)                             8 ( 20.0)       
##   Other = 1 (%)                                  3 (  7.5)       
##   Dialysis = 1 (%)                              11 ( 27.5)       
##   Pressers = 1 (%)                               2 (  5.0)       
##   Mechanical Ventilation = 1 (%)                 1 (  2.5)       
##                                             Stratified by diversity_group
##                                              High Diversity       p     
##   n                                             27                      
##   race (%)                                                         0.224
##      American Indian or Alaska Native            0 (  0.0)              
##      Asian/Mideast Indian                        3 ( 11.1)              
##      Black/African-American                      2 (  7.4)              
##      More than one Race                          2 (  7.4)              
##      Patient Declined                            3 ( 11.1)              
##      Unknown                                     2 (  7.4)              
##      White                                      15 ( 55.6)              
##   sex = Male (%)                                18 ( 66.7)         0.397
##   age (median [IQR])                         60.00 [39.00, 68.00]  0.358
##   meld_transplant (median [IQR])             18.00 [12.00, 27.50] <0.001
##   Alcoholic Hepatitis = 1 (%)                    2 (  7.4)         0.236
##   Alcoholic Cirrhosis = 1 (%)                    5 ( 18.5)         0.005
##   NAFLD/NASH = 1 (%)                             5 ( 18.5)         0.795
##   Primary Sclerosing Cholangitis = 1 (%)         3 ( 11.1)         0.316
##   Acute Viral Hepatitis = 1 (%)                  1 (  3.7)         0.841
##   Chronic Hepatitis B = 1 (%)                    0 (  0.0)         0.429
##   Chronic Hepatitis C = 1 (%)                    4 ( 14.8)         0.002
##   Autoimmune = 1 (%)                             2 (  7.4)         0.209
##   Wilson's Disease = 1 (%)                       1 (  3.7)         0.379
##   Alpha-1 Antitrypsin = 0 (%)                   27 (100.0)            NA
##   Hemachromatosis = 1 (%)                        0 (  0.0)         0.429
##   Drug Induced Liver Injury or Toxin = 1 (%)     0 (  0.0)         0.429
##   Budd Chiari = 0 (%)                           27 (100.0)            NA
##   Cryptogenic = 1 (%)                            0 (  0.0)         0.117
##   Malignancy = 1 (%)                            12 ( 44.4)         0.001
##   Other = 1 (%)                                  5 ( 18.5)         0.398
##   Dialysis = 1 (%)                               3 ( 11.1)         0.020
##   Pressers = 1 (%)                               1 (  3.7)         0.037
##   Mechanical Ventilation = 1 (%)                 1 (  3.7)         0.154
##                                             Stratified by diversity_group
##                                              test   
##   n                                                 
##   race (%)                                          
##      American Indian or Alaska Native               
##      Asian/Mideast Indian                           
##      Black/African-American                         
##      More than one Race                             
##      Patient Declined                               
##      Unknown                                        
##      White                                          
##   sex = Male (%)                                    
##   age (median [IQR])                         nonnorm
##   meld_transplant (median [IQR])             nonnorm
##   Alcoholic Hepatitis = 1 (%)                       
##   Alcoholic Cirrhosis = 1 (%)                       
##   NAFLD/NASH = 1 (%)                                
##   Primary Sclerosing Cholangitis = 1 (%)            
##   Acute Viral Hepatitis = 1 (%)                     
##   Chronic Hepatitis B = 1 (%)                       
##   Chronic Hepatitis C = 1 (%)                       
##   Autoimmune = 1 (%)                                
##   Wilson's Disease = 1 (%)                          
##   Alpha-1 Antitrypsin = 0 (%)                       
##   Hemachromatosis = 1 (%)                           
##   Drug Induced Liver Injury or Toxin = 1 (%)        
##   Budd Chiari = 0 (%)                               
##   Cryptogenic = 1 (%)                               
##   Malignancy = 1 (%)                                
##   Other = 1 (%)                                     
##   Dialysis = 1 (%)                                  
##   Pressers = 1 (%)                                  
##   Mechanical Ventilation = 1 (%)
# Round-trip the printed table through a CSV file: reading it back yields a
# plain data frame, which makes the p-value column easy to adjust.
write.csv(div_tab1_2, "./Results/Demo_Table_1_Diversity.csv", row.names = TRUE)

# Step 1: reload the table and restore the display headers that the CSV
# round trip altered (row-label column `X`, dots in place of spaces).
div_tab1_2_padjust1 <- dplyr::rename(
    read.csv("./Results/Demo_Table_1_Diversity.csv"),
    ` ` = X,
    `Low Diversity` = Low.Diversity,
    `Medium Diversity` = Medium.Diversity,
    `High Diversity` = High.Diversity
)

# Step 2: same table with the row labels stored as a factor whose levels
# follow the original row order; used below to restore that order.
div_tab1_2_padjust2 <- mutate(
    div_tab1_2_padjust1,
    ` ` = factor(` `, levels = div_tab1_2_padjust1$` `)
)

# Step 3: label rows that have a p-value but no test name as "chi.sq", apply
# the BH adjustment separately within each test type, put the rows back in
# their original order, and blank out missing p-values for export.
# NOTE(review): the printed table shows at least one p-value as the string
# "<0.001"; after the CSV round trip such entries are not numeric, so confirm
# they are adjusted (rather than dropped) as intended.
div_tab1_2_padjust3 <- div_tab1_2_padjust1 %>%
    mutate(test = ifelse(test == "" & !is.na(p), "chi.sq", test)) %>%
    group_by(test) %>%
    rstatix::adjust_pvalue(p.col = "p", method = "BH") %>%
    ungroup() %>%
    mutate(` ` = factor(` `, div_tab1_2_padjust2$` `)) %>%
    arrange(` `) %>%
    mutate(across(c(p, p.adj), ~ ifelse(is.na(.x), "", .x)))

# Write the table with the adjusted p-values appended
write.csv(
    div_tab1_2_padjust3,
    "./Results/Demo_Table_1_Diversity_padjust.csv",
    row.names = FALSE
)

Culture: Supplemental Table 3

# Percentage of infections by organism. Restricted to rows flagged with a
# bacterial infection whose eday lies between 0 and 30 inclusive (per the
# original note: the first 30 days after transplant).
cult_percent_infx_all <- peri_criteria_all %>%
    filter(bact_infection_present == "Yes", between(eday, 0, 30)) %>%
    # Keep one row per patient/sample/day, preferring the largest infx_stool
    group_by(patientID, sampleID, eday) %>%
    arrange(-infx_stool) %>%
    dplyr::slice(1) %>%
    ungroup() %>%
    select(
        patientID, sampleID, bact_infection_present, infx_stool,
        organism1, micro1.factor
    ) %>%
    distinct() %>%
    # Normalise organism names (strip whitespace, lower-case); anything that
    # does not match the keep-list below is relabelled "Culture Negative".
    # NOTE(review): the keep-list contains "enterobacterales" but not
    # "enterobacter", so an "Enterobacter cloacae" entry would presumably be
    # relabelled "Culture Negative" before its indicator is computed - confirm
    # against the values actually present in organism1.
    mutate(
        organism1 = gsub(x = organism1, pattern = "\\s+", replacement = ""),
        organism1 = str_to_lower(string = organism1),
        organism1 = ifelse(
            grepl(x = organism1, pattern = "enterococcus|enterobacterales|klebsiella|escherichia|citrobacter|proteus|staphyl|clostrid|pseudo|steno|bacteroides|helico"),
            organism1,
            "Culture Negative"
        )
    ) %>%
    # One logical indicator column per organism of interest.
    # NOTE(review): "enterobactercloaceae" and "stenotrophmonasmaltophilia"
    # differ from the usual spellings ("cloacae", "stenotrophomonas") and may
    # never match correctly spelled entries - verify against the data.
    mutate(
        `Enterococcus faecium` = grepl(x = organism1, pattern = "enterococcusfaecium", ignore.case = TRUE),
        `Enterococcus faecalis` = grepl(x = organism1, pattern = "enterococcusfaecalis", ignore.case = TRUE),
        `Enterococcus avium` = grepl(x = organism1, pattern = "enterococcusavium", ignore.case = TRUE),
        `Enterococcus gallinarum` = grepl(x = organism1, pattern = "enterococcusgallinarum", ignore.case = TRUE),
        `Klebsiella pneumoniae` = grepl(x = organism1, pattern = "klebsiellapneumoniae", ignore.case = TRUE),
        `Enterobacter cloaceae` = grepl(x = organism1, pattern = "enterobactercloaceae", ignore.case = TRUE),
        `Escherichia coli` = grepl(x = organism1, pattern = "escherichiacoli", ignore.case = TRUE),
        `Citrobacter freundii` = grepl(x = organism1, pattern = "citrobacterfreundii", ignore.case = TRUE),
        `Proteus mirabilis` = grepl(x = organism1, pattern = "proteusmirabilis", ignore.case = TRUE),
        `Staphylococcus aureus` = grepl(x = organism1, pattern = "staphylococcusaureus", ignore.case = TRUE),
        `Staphylococcus epidermis` = grepl(x = organism1, pattern = "staphylococcusepidermidis", ignore.case = TRUE),
        `Pseudomonas aeruginosa` = grepl(x = organism1, pattern = "pseudomonasaeruginosa", ignore.case = TRUE),
        `Stenotrophmonas maltophilia` = grepl(x = organism1, pattern = "stenotrophmonasmaltophilia", ignore.case = TRUE),
        `Helicobacter pylori` = grepl(x = organism1, pattern = "helicobacterpylori", ignore.case = TRUE),
        `Clostridium difficile` = grepl(x = organism1, pattern = "clostridiumdifficile|clostridioidesdifficile", ignore.case = TRUE),
        `Bacteroides sp.` = grepl(x = organism1, pattern = "bacteroides", ignore.case = TRUE),
        `Culture Negative` = grepl(x = organism1, pattern = "Culture Negative", ignore.case = TRUE)
    ) %>%
    pivot_longer(
        -c(patientID:micro1.factor),
        names_to = "organisms", values_to = "org_presence"
    ) %>%
    # Count each organism at most once per patient and infx_stool value
    filter(org_presence) %>%
    distinct(patientID, infx_stool, organisms) %>%
    dplyr::count(organisms, name = "count") %>%
    # Share of each organism among all counted infections
    mutate(percent = count / sum(count) * 100) %>%
    select(organisms, percent, count)

# Distribution of infections by `key` (the infection location, per the
# original note). Restricted to rows flagged with a bacterial infection whose
# eday lies between 0 and 30 inclusive (first 30 days after transplant).
cult_location_infx_all <- peri_criteria_all %>%
    filter(bact_infection_present == "Yes", between(eday, 0, 30)) %>%
    # Keep one row per patient/day/key, preferring the largest infx_stool
    group_by(patientID, eday, key) %>%
    arrange(-infx_stool) %>%
    dplyr::slice(1) %>%
    ungroup() %>%
    select(patientID, bact_infection_present, infx_stool, organism1, key) %>%
    distinct() %>%
    # Normalise organism names (strip whitespace, lower-case); anything that
    # does not match the keep-list below is relabelled "Culture Negative".
    # NOTE(review): the keep-list contains "enterobacterales" but not
    # "enterobacter", so an "Enterobacter cloacae" entry would presumably be
    # relabelled "Culture Negative" before its indicator is computed - confirm
    # against the values actually present in organism1.
    mutate(
        organism1 = gsub(x = organism1, pattern = "\\s+", replacement = ""),
        organism1 = str_to_lower(string = organism1),
        organism1 = ifelse(
            grepl(x = organism1, pattern = "enterococcus|enterobacterales|klebsiella|escherichia|citrobacter|proteus|staphyl|clostrid|pseudo|steno|bacteroides|helico"),
            organism1,
            "Culture Negative"
        )
    ) %>%
    # One logical indicator column per organism of interest.
    # NOTE(review): "enterobactercloaceae" and "stenotrophmonasmaltophilia"
    # differ from the usual spellings ("cloacae", "stenotrophomonas") and may
    # never match correctly spelled entries - verify against the data.
    mutate(
        `Enterococcus faecium` = grepl(x = organism1, pattern = "enterococcusfaecium", ignore.case = TRUE),
        `Enterococcus faecalis` = grepl(x = organism1, pattern = "enterococcusfaecalis", ignore.case = TRUE),
        `Enterococcus avium` = grepl(x = organism1, pattern = "enterococcusavium", ignore.case = TRUE),
        `Enterococcus gallinarum` = grepl(x = organism1, pattern = "enterococcusgallinarum", ignore.case = TRUE),
        `Klebsiella pneumoniae` = grepl(x = organism1, pattern = "klebsiellapneumoniae", ignore.case = TRUE),
        `Enterobacter cloaceae` = grepl(x = organism1, pattern = "enterobactercloaceae", ignore.case = TRUE),
        `Escherichia coli` = grepl(x = organism1, pattern = "escherichiacoli", ignore.case = TRUE),
        `Citrobacter freundii` = grepl(x = organism1, pattern = "citrobacterfreundii", ignore.case = TRUE),
        `Proteus mirabilis` = grepl(x = organism1, pattern = "proteusmirabilis", ignore.case = TRUE),
        `Staphylococcus aureus` = grepl(x = organism1, pattern = "staphylococcusaureus", ignore.case = TRUE),
        `Staphylococcus epidermis` = grepl(x = organism1, pattern = "staphylococcusepidermidis", ignore.case = TRUE),
        `Pseudomonas aeruginosa` = grepl(x = organism1, pattern = "pseudomonasaeruginosa", ignore.case = TRUE),
        `Stenotrophmonas maltophilia` = grepl(x = organism1, pattern = "stenotrophmonasmaltophilia", ignore.case = TRUE),
        `Helicobacter pylori` = grepl(x = organism1, pattern = "helicobacterpylori", ignore.case = TRUE),
        `Clostridium difficile` = grepl(x = organism1, pattern = "clostridiumdifficile|clostridioidesdifficile", ignore.case = TRUE),
        `Bacteroides sp.` = grepl(x = organism1, pattern = "bacteroides", ignore.case = TRUE),
        `Culture Negative` = grepl(x = organism1, pattern = "Culture Negative", ignore.case = TRUE)
    ) %>%
    pivot_longer(
        -c(patientID:key),
        names_to = "organisms", values_to = "org_presence"
    ) %>%
    # Count each key at most once per patient and infx_stool value, and only
    # when at least one organism indicator is set for it
    filter(org_presence) %>%
    distinct(patientID, infx_stool, key) %>%
    dplyr::count(key, name = "count") %>%
    # Share of each key among all counted infections
    mutate(percent = count / sum(count) * 100) %>%
    select(key, percent, count)

# Distribution of infections by type (diag_cat3). Restricted to rows flagged
# with a bacterial infection, whose `variable` matches "anatomy.+", and whose
# eday lies between 0 and 30 inclusive (first 30 days after transplant).
cult_type_infx_all <- peri_criteria_all %>%
    filter(
        bact_infection_present == "Yes",
        grepl(pattern = "anatomy.+", x = variable),
        between(eday, 0, 30)
    ) %>%
    # Keep one row per patient/day/diagnosis, preferring the largest infx_stool
    group_by(patientID, eday, diag_cat2) %>%
    arrange(-infx_stool) %>%
    dplyr::slice(1) %>%
    ungroup() %>%
    # Collapse the detailed diagnosis into broader categories; the first
    # matching rule wins and unmatched diagnoses are kept as they are
    mutate(
        diag_cat3 = case_when(
            grepl("abdominal", diag_cat2, ignore.case = TRUE) ~ "Abdominal",
            grepl("vir|bronchitis|covid-19|cmv", diag_cat2, ignore.case = TRUE) ~ "Viral",
            grepl("bacteremia", diag_cat2, ignore.case = TRUE) ~ "Bacteremia",
            grepl("thrush", diag_cat2, ignore.case = TRUE) ~ "Fungal",
            grepl("cholangitis|empyema|panniculitis|peritonitis|latent tb", diag_cat2, ignore.case = TRUE) ~ "Bacterial",
            grepl("cystitis|pyelonephritis", diag_cat2, ignore.case = TRUE) ~ "Urinary Tract Infection",
            grepl("pneumonia", diag_cat2, ignore.case = TRUE) ~ "Pneumonia",
            TRUE ~ diag_cat2
        ),
        diag_cat3 = str_to_title(diag_cat3)
    ) %>%
    select(
        patientID, bact_infection_present, infx_stool, organism1,
        micro1.factor, key, diag_cat3
    ) %>%
    distinct() %>%
    # Normalise organism names (strip whitespace, lower-case); anything that
    # does not match the keep-list below is relabelled "Culture Negative".
    # NOTE(review): the keep-list contains "enterobacterales" but not
    # "enterobacter", so an "Enterobacter cloacae" entry would presumably be
    # relabelled "Culture Negative" before its indicator is computed - confirm
    # against the values actually present in organism1.
    mutate(
        organism1 = gsub(x = organism1, pattern = "\\s+", replacement = ""),
        organism1 = str_to_lower(string = organism1),
        organism1 = ifelse(
            grepl(x = organism1, pattern = "enterococcus|enterobacterales|klebsiella|escherichia|citrobacter|proteus|staphyl|clostrid|pseudo|steno|bacteroides|helico"),
            organism1,
            "Culture Negative"
        )
    ) %>%
    # One logical indicator column per organism of interest.
    # NOTE(review): "enterobactercloaceae" and "stenotrophmonasmaltophilia"
    # differ from the usual spellings ("cloacae", "stenotrophomonas") and may
    # never match correctly spelled entries - verify against the data.
    mutate(
        `Enterococcus faecium` = grepl(x = organism1, pattern = "enterococcusfaecium", ignore.case = TRUE),
        `Enterococcus faecalis` = grepl(x = organism1, pattern = "enterococcusfaecalis", ignore.case = TRUE),
        `Enterococcus avium` = grepl(x = organism1, pattern = "enterococcusavium", ignore.case = TRUE),
        `Enterococcus gallinarum` = grepl(x = organism1, pattern = "enterococcusgallinarum", ignore.case = TRUE),
        `Klebsiella pneumoniae` = grepl(x = organism1, pattern = "klebsiellapneumoniae", ignore.case = TRUE),
        `Enterobacter cloaceae` = grepl(x = organism1, pattern = "enterobactercloaceae", ignore.case = TRUE),
        `Escherichia coli` = grepl(x = organism1, pattern = "escherichiacoli", ignore.case = TRUE),
        `Citrobacter freundii` = grepl(x = organism1, pattern = "citrobacterfreundii", ignore.case = TRUE),
        `Proteus mirabilis` = grepl(x = organism1, pattern = "proteusmirabilis", ignore.case = TRUE),
        `Staphylococcus aureus` = grepl(x = organism1, pattern = "staphylococcusaureus", ignore.case = TRUE),
        `Staphylococcus epidermis` = grepl(x = organism1, pattern = "staphylococcusepidermidis", ignore.case = TRUE),
        `Pseudomonas aeruginosa` = grepl(x = organism1, pattern = "pseudomonasaeruginosa", ignore.case = TRUE),
        `Stenotrophmonas maltophilia` = grepl(x = organism1, pattern = "stenotrophmonasmaltophilia", ignore.case = TRUE),
        `Helicobacter pylori` = grepl(x = organism1, pattern = "helicobacterpylori", ignore.case = TRUE),
        `Clostridium difficile` = grepl(x = organism1, pattern = "clostridiumdifficile|clostridioidesdifficile", ignore.case = TRUE),
        `Bacteroides sp.` = grepl(x = organism1, pattern = "bacteroides", ignore.case = TRUE),
        `Culture Negative` = grepl(x = organism1, pattern = "Culture Negative", ignore.case = TRUE)
    ) %>%
    pivot_longer(
        -c(patientID:diag_cat3),
        names_to = "organisms", values_to = "org_presence"
    ) %>%
    # Count each infection type at most once per patient and infx_stool value,
    # and only when at least one organism indicator is set for it
    filter(org_presence) %>%
    distinct(patientID, infx_stool, diag_cat3) %>%
    dplyr::count(diag_cat3, name = "count") %>%
    # Share of each infection type among all counted infections
    mutate(percent = count / sum(count) * 100) %>%
    select(diag_cat3, percent, count)

# Stack the organism, location and infection-type summaries into one table
# and round the percentages to two decimals
culture_tot <- mutate(
    bind_rows(
        cult_percent_infx_all,
        cult_location_infx_all,
        cult_type_infx_all
    ),
    percent = round(percent, 2)
)

# Write the combined culture summary (Supplemental Table 3) to csv
write.csv(culture_tot, "./Results/Supplemental_Table_3.csv",
    row.names = FALSE)

Antibiotics

## Medication indicator columns to summarise, in the order they should
## appear in the tables
abx_vars <- c(
    "Basiliximab",
    "Mycophenolate",
    "Steroid",
    "Systemic Vancomycin",
    "Tacrolimus",
    "Cefepime",
    "Metronidazole",
    "Piperacillin/Tazobactam",
    "Rifaximin",
    "Ceftriaxone",
    "Ciprofloxacin",
    "Gentamicin",
    "Tobramicin",
    "Daptomycin",
    "Meropenem",
    "Oral Vancomycin"
)

# Build a one-row-per-patient table of 0/1 medication indicators (one column
# per immunosuppressant / antibiotic label) alongside any_infection.
abx2 <- abx %>%
    # Keep rows for the medications of interest, steroid-class drugs, or any
    # row whose mar_action mentions "given"
    filter(
        grepl(pattern = "basilix|tacro|steroid|mycophenolate|lactulose", medication_name, ignore.case = TRUE) |
            grepl(pattern = "GLUCOCORTICOIDS|steroid", pharm_class, ignore.case = TRUE) |
            grepl(pattern = "steroid", pharm_sub_class, ignore.case = TRUE) |
            grepl("given", mar_action, ignore.case = TRUE)
    ) %>%
    mutate(
        # First matching rule wins; rows matching no rule get NA.
        # NOTE(review): "one|ide" also matches names such as "ceftriaxone" or
        # anything containing "chloride", which may explain why the summary
        # shows Steroid = 1 for every patient - confirm this is intended.
        Immunosuppressants = case_when(
            grepl("basilix", medication_name, ignore.case = TRUE) ~ "Basiliximab",
            grepl("tacro", medication_name, ignore.case = TRUE) ~ "Tacrolimus",
            grepl("one|ide|solu-cortef", medication_name, ignore.case = TRUE) ~ "Steroid",
            grepl("mycophenolate", medication_name, ignore.case = TRUE) ~ "Mycophenolate",
            grepl("lactulose", medication_name, ignore.case = TRUE) ~ "Lactulose"
        ),
        Antibiotics = case_when(
            grepl("rifaximin", medication_name, ignore.case = TRUE) ~ "Rifaximin",
            grepl("lactulose", medication_name, ignore.case = TRUE) ~ "Lactulose",
            grepl("ceftriaxone", medication_name, ignore.case = TRUE) ~ "Ceftriaxone",
            grepl("piperacillin|tazobactam", medication_name, ignore.case = TRUE) ~ "Piperacillin/Tazobactam",
            grepl("cefepime", medication_name, ignore.case = TRUE) ~ "Cefepime",
            grepl("meropenem", medication_name, ignore.case = TRUE) ~ "Meropenem",
            grepl("gentamicin", medication_name, ignore.case = TRUE) ~ "Gentamicin",
            grepl("tobramycin", medication_name, ignore.case = TRUE) ~ "Tobramicin",
            grepl("vancomycin.+oral", medication_name, ignore.case = TRUE) ~ "Oral Vancomycin",
            grepl("vancomycin.+(IV|Intravenous)", medication_name, ignore.case = TRUE) ~ "Systemic Vancomycin",
            grepl("METRONIDAZOLE", medication_name, ignore.case = TRUE) ~ "Metronidazole",
            grepl("DAPTOMYCIN", medication_name, ignore.case = TRUE) ~ "Daptomycin",
            grepl("linezolid", medication_name, ignore.case = TRUE) ~ "Linezolid",
            grepl("fluconazole", medication_name, ignore.case = TRUE) ~ "Fluconazole",
            grepl("micafungin", medication_name, ignore.case = TRUE) ~ "Micafungin",
            # Ciprofloxacin, excluding rows dosed in drops
            grepl("cipro", medication_name, ignore.case = TRUE) &
                !grepl("drop", dose_units, ignore.case = TRUE) ~ "Ciprofloxacin"
        ),
        # Immunosuppressants count when given on days 0-30 or ordered as
        # outpatient; antibiotics count when given on days -14 to 1
        action = case_when(
            !is.na(Immunosuppressants) & between(days_transplant, 0, 30) |
                !is.na(Immunosuppressants) & ordering_mode == "Outpatient" ~ "keep",
            !is.na(Antibiotics) & between(days_transplant, -14, 1) ~ "keep",
            TRUE ~ "remove"
        )
    ) %>%
    # Earliest kept row per patient and medication-label combination
    group_by(patientID, Immunosuppressants, Antibiotics) %>%
    arrange(days_transplant) %>%
    filter(action == "keep") %>%
    dplyr::slice(1) %>%
    select(patientID, Immunosuppressants, Antibiotics) %>%
    # Explicit key: patientID is the only column the two tables share
    left_join(
        peri_matrix_all %>%
            select(patientID, any_infection),
        by = "patientID"
    ) %>%
    # Long format: one row per patient and non-missing medication label
    pivot_longer(
        !c(patientID, any_infection),
        names_to = "variable", values_to = "value"
    ) %>%
    drop_na(value) %>%
    mutate(variable = 1) %>%
    # Wide format: one indicator column per medication label. id_cols is
    # passed by name because tidyr deprecated supplying it positionally.
    pivot_wider(
        id_cols = c(patientID, any_infection),
        names_from = "value", values_from = "variable", values_fn = min
    ) %>%
    # Medications a patient never received become 0
    replace(is.na(.), 0)

# Table 1 of the medication indicators, stratified by any_infection; every
# variable in abx_vars is declared as a factor variable
abx_tab1_1 <- CreateTableOne(
    data = abx2,
    strata = "any_infection",
    vars = abx_vars,
    factorVars = abx_vars,
    includeNA = FALSE,
    testNonNormal = "wilcox.test"
)

summary(abx_tab1_1)
## 
##      ### Summary of categorical variables ### 
## 
## any_infection: 0
##                      var  n miss p.miss level freq percent cum.percent
##              Basiliximab 82    0    0.0     0   29    35.4        35.4
##                                             1   53    64.6       100.0
##                                                                       
##            Mycophenolate 82    0    0.0     0   17    20.7        20.7
##                                             1   65    79.3       100.0
##                                                                       
##                  Steroid 82    0    0.0     1   82   100.0       100.0
##                                                                       
##      Systemic Vancomycin 82    0    0.0     0   37    45.1        45.1
##                                             1   45    54.9       100.0
##                                                                       
##               Tacrolimus 82    0    0.0     0    1     1.2         1.2
##                                             1   81    98.8       100.0
##                                                                       
##                 Cefepime 82    0    0.0     0   52    63.4        63.4
##                                             1   30    36.6       100.0
##                                                                       
##            Metronidazole 82    0    0.0     0   46    56.1        56.1
##                                             1   36    43.9       100.0
##                                                                       
##  Piperacillin/Tazobactam 82    0    0.0     0   12    14.6        14.6
##                                             1   70    85.4       100.0
##                                                                       
##                Rifaximin 82    0    0.0     0   41    50.0        50.0
##                                             1   41    50.0       100.0
##                                                                       
##              Ceftriaxone 82    0    0.0     0   64    78.0        78.0
##                                             1   18    22.0       100.0
##                                                                       
##            Ciprofloxacin 82    0    0.0     0   66    80.5        80.5
##                                             1   16    19.5       100.0
##                                                                       
##               Gentamicin 82    0    0.0     0   81    98.8        98.8
##                                             1    1     1.2       100.0
##                                                                       
##               Tobramicin 82    0    0.0     0   78    95.1        95.1
##                                             1    4     4.9       100.0
##                                                                       
##               Daptomycin 82    0    0.0     0   81    98.8        98.8
##                                             1    1     1.2       100.0
##                                                                       
##                Meropenem 82    0    0.0     0   78    95.1        95.1
##                                             1    4     4.9       100.0
##                                                                       
##          Oral Vancomycin 82    0    0.0     0   80    97.6        97.6
##                                             1    2     2.4       100.0
##                                                                       
## ------------------------------------------------------------ 
## any_infection: 1
##                      var  n miss p.miss level freq percent cum.percent
##              Basiliximab 25    0    0.0     0    8    32.0        32.0
##                                             1   17    68.0       100.0
##                                                                       
##            Mycophenolate 25    0    0.0     0    4    16.0        16.0
##                                             1   21    84.0       100.0
##                                                                       
##                  Steroid 25    0    0.0     1   25   100.0       100.0
##                                                                       
##      Systemic Vancomycin 25    0    0.0     0    7    28.0        28.0
##                                             1   18    72.0       100.0
##                                                                       
##               Tacrolimus 25    0    0.0     0    0     0.0         0.0
##                                             1   25   100.0       100.0
##                                                                       
##                 Cefepime 25    0    0.0     0   13    52.0        52.0
##                                             1   12    48.0       100.0
##                                                                       
##            Metronidazole 25    0    0.0     0   12    48.0        48.0
##                                             1   13    52.0       100.0
##                                                                       
##  Piperacillin/Tazobactam 25    0    0.0     0    2     8.0         8.0
##                                             1   23    92.0       100.0
##                                                                       
##                Rifaximin 25    0    0.0     0   13    52.0        52.0
##                                             1   12    48.0       100.0
##                                                                       
##              Ceftriaxone 25    0    0.0     0   17    68.0        68.0
##                                             1    8    32.0       100.0
##                                                                       
##            Ciprofloxacin 25    0    0.0     0   21    84.0        84.0
##                                             1    4    16.0       100.0
##                                                                       
##               Gentamicin 25    0    0.0     0   24    96.0        96.0
##                                             1    1     4.0       100.0
##                                                                       
##               Tobramicin 25    0    0.0     0   22    88.0        88.0
##                                             1    3    12.0       100.0
##                                                                       
##               Daptomycin 25    0    0.0     0   22    88.0        88.0
##                                             1    3    12.0       100.0
##                                                                       
##                Meropenem 25    0    0.0     0   24    96.0        96.0
##                                             1    1     4.0       100.0
##                                                                       
##          Oral Vancomycin 25    0    0.0     0   24    96.0        96.0
##                                             1    1     4.0       100.0
##                                                                       
## 
## p-values
##                            pApprox     pExact
## Basiliximab             0.94452638 0.81451961
## Mycophenolate           0.81509810 0.77620871
## Steroid                         NA         NA
## Systemic Vancomycin     0.19672525 0.16534958
## Tacrolimus              1.00000000 1.00000000
## Cefepime                0.42996989 0.35335166
## Metronidazole           0.62972047 0.50032437
## Piperacillin/Tazobactam 0.60142514 0.51269562
## Rifaximin               1.00000000 1.00000000
## Ceftriaxone             0.44773353 0.30160442
## Ciprofloxacin           0.91929331 0.77859454
## Gentamicin              0.95599591 0.41438900
## Tobramicin              0.42443880 0.35037337
## Daptomycin              0.05938894 0.03899733
## Meropenem               1.00000000 1.00000000
## Oral Vancomycin         1.00000000 0.55382019
## 
## Standardize mean differences
##                             1 vs 2
## Basiliximab             0.07126120
## Mycophenolate           0.12243021
## Steroid                 0.00000000
## Systemic Vancomycin     0.36127548
## Tacrolimus              0.15713484
## Cefepime                0.23261173
## Metronidazole           0.16262229
## Piperacillin/Tazobactam 0.21056770
## Rifaximin               0.04001601
## Ceftriaxone             0.22787388
## Ciprofloxacin           0.09200514
## Gentamicin              0.17507370
## Tobramicin              0.25833951
## Daptomycin              0.44449214
## Meropenem               0.04264158
## Oral Vancomycin         0.08851811
# Print the formatted table (thousands separated by commas) and keep what
# print() returns for export
abx_tab1_2 <- print(
    abx_tab1_1,
    formatOptions = list(big.mark = ",")
)
##                                  Stratified by any_infection
##                                   0           1           p      test
##   n                               82          25                     
##   Basiliximab = 1 (%)             53 ( 64.6)  17 ( 68.0)   0.945     
##   Mycophenolate = 1 (%)           65 ( 79.3)  21 ( 84.0)   0.815     
##   Steroid = 1 (%)                 82 (100.0)  25 (100.0)      NA     
##   Systemic Vancomycin = 1 (%)     45 ( 54.9)  18 ( 72.0)   0.197     
##   Tacrolimus = 1 (%)              81 ( 98.8)  25 (100.0)   1.000     
##   Cefepime = 1 (%)                30 ( 36.6)  12 ( 48.0)   0.430     
##   Metronidazole = 1 (%)           36 ( 43.9)  13 ( 52.0)   0.630     
##   Piperacillin/Tazobactam = 1 (%) 70 ( 85.4)  23 ( 92.0)   0.601     
##   Rifaximin = 1 (%)               41 ( 50.0)  12 ( 48.0)   1.000     
##   Ceftriaxone = 1 (%)             18 ( 22.0)   8 ( 32.0)   0.448     
##   Ciprofloxacin = 1 (%)           16 ( 19.5)   4 ( 16.0)   0.919     
##   Gentamicin = 1 (%)               1 (  1.2)   1 (  4.0)   0.956     
##   Tobramicin = 1 (%)               4 (  4.9)   3 ( 12.0)   0.424     
##   Daptomycin = 1 (%)               1 (  1.2)   3 ( 12.0)   0.059     
##   Meropenem = 1 (%)                4 (  4.9)   1 (  4.0)   1.000     
##   Oral Vancomycin = 1 (%)          2 (  2.4)   1 (  4.0)   1.000
# Round-trip the printed table through csv: reading the file back loads it as
# a data frame, which makes the p-value adjustment below straightforward
write.csv(abx_tab1_2, "./Results/ABX_Table_1.csv", row.names = TRUE)

# Re-import the table and give the label and stratum columns readable names
abx_tab1_2_padjust1 <- dplyr::rename(
    read.csv("./Results/ABX_Table_1.csv"),
    ` ` = X,
    `No Infection` = X0,
    `Bacterial Infection` = X1
)

# Same table with the row labels stored as a factor in their original order;
# its levels are used below to restore the row order after the adjustment
abx_tab1_2_padjust2 <- mutate(
    abx_tab1_2_padjust1,
    ` ` = factor(` `, levels = abx_tab1_2_padjust1$` `)
)

# Label rows that carry a p-value as "chi.sq", apply the BH adjustment within
# each test label, restore the original row order, and show missing p-values
# as empty strings
abx_tab1_2_padjust3 <- abx_tab1_2_padjust1 %>%
    mutate(
        test = ifelse(!is.na(p) & is.na(test), "chi.sq", "")
    ) %>%
    group_by(test) %>%
    rstatix::adjust_pvalue(p.col = "p", method = "BH") %>%
    ungroup() %>%
    mutate(
        ` ` = factor(` `, abx_tab1_2_padjust2$` `)
    ) %>%
    arrange(` `) %>%
    mutate(
        p = ifelse(is.na(p), "", p),
        p.adj = ifelse(is.na(p.adj), "", p.adj)
    )

# Write the table with the adjusted p-values appended
write.csv(
    abx_tab1_2_padjust3,
    "./Results/ABX_Table_1_padjust.csv",
    row.names = FALSE
)

#### Stratify by diversity ####
# Attach each patient's diversity_group to the medication indicators. The
# patientID is extracted from sampleID ("lt-" followed by digits); samples
# without a recognisable patientID are dropped before joining.
abx2_div <- abx2 %>%
    left_join(
        metaphlan_df_sumry %>%
            mutate(patientID = str_extract(string = sampleID, pattern = "lt-[0-9]+")) %>%
            distinct(patientID, diversity_group) %>%
            drop_na(patientID) %>%
            droplevels(),
        # Explicit key: patientID is the only column the two tables share
        by = "patientID"
    )

# Table 1 of the medication indicators, stratified by diversity_group; every
# variable in abx_vars is declared as a factor variable and NA is included
# as a level
abx_div_tab1_1 <- CreateTableOne(
    data = abx2_div,
    strata = "diversity_group",
    vars = abx_vars,
    factorVars = abx_vars,
    includeNA = TRUE,
    testNonNormal = "kruskal.test"
)

summary(abx_div_tab1_1)
## 
##      ### Summary of categorical variables ### 
## 
## diversity_group: Low Diversity
##                      var  n miss p.miss level freq percent cum.percent
##              Basiliximab 40    0    0.0     0    9    22.5        22.5
##                                             1   31    77.5       100.0
##                                                                       
##            Mycophenolate 40    0    0.0     0    7    17.5        17.5
##                                             1   33    82.5       100.0
##                                                                       
##                  Steroid 40    0    0.0     1   40   100.0       100.0
##                                                                       
##      Systemic Vancomycin 40    0    0.0     0    7    17.5        17.5
##                                             1   33    82.5       100.0
##                                                                       
##               Tacrolimus 40    0    0.0     0    0     0.0         0.0
##                                             1   40   100.0       100.0
##                                                                       
##                 Cefepime 40    0    0.0     0   13    32.5        32.5
##                                             1   27    67.5       100.0
##                                                                       
##            Metronidazole 40    0    0.0     0   11    27.5        27.5
##                                             1   29    72.5       100.0
##                                                                       
##  Piperacillin/Tazobactam 40    0    0.0     0    8    20.0        20.0
##                                             1   32    80.0       100.0
##                                                                       
##                Rifaximin 40    0    0.0     0   10    25.0        25.0
##                                             1   30    75.0       100.0
##                                                                       
##              Ceftriaxone 40    0    0.0     0   21    52.5        52.5
##                                             1   19    47.5       100.0
##                                                                       
##            Ciprofloxacin 40    0    0.0     0   30    75.0        75.0
##                                             1   10    25.0       100.0
##                                                                       
##               Gentamicin 40    0    0.0     0   38    95.0        95.0
##                                             1    2     5.0       100.0
##                                                                       
##               Tobramicin 40    0    0.0     0   37    92.5        92.5
##                                             1    3     7.5       100.0
##                                                                       
##               Daptomycin 40    0    0.0     0   37    92.5        92.5
##                                             1    3     7.5       100.0
##                                                                       
##                Meropenem 40    0    0.0     0   36    90.0        90.0
##                                             1    4    10.0       100.0
##                                                                       
##          Oral Vancomycin 40    0    0.0     0   39    97.5        97.5
##                                             1    1     2.5       100.0
##                                                                       
## ------------------------------------------------------------ 
## diversity_group: Medium Diversity
##                      var  n miss p.miss level freq percent cum.percent
##              Basiliximab 40    0    0.0     0   15    37.5        37.5
##                                             1   25    62.5       100.0
##                                                                       
##            Mycophenolate 40    0    0.0     0    7    17.5        17.5
##                                             1   33    82.5       100.0
##                                                                       
##                  Steroid 40    0    0.0     1   40   100.0       100.0
##                                                                       
##      Systemic Vancomycin 40    0    0.0     0   23    57.5        57.5
##                                             1   17    42.5       100.0
##                                                                       
##               Tacrolimus 40    0    0.0     0    1     2.5         2.5
##                                             1   39    97.5       100.0
##                                                                       
##                 Cefepime 40    0    0.0     0   30    75.0        75.0
##                                             1   10    25.0       100.0
##                                                                       
##            Metronidazole 40    0    0.0     0   27    67.5        67.5
##                                             1   13    32.5       100.0
##                                                                       
##  Piperacillin/Tazobactam 40    0    0.0     0    3     7.5         7.5
##                                             1   37    92.5       100.0
##                                                                       
##                Rifaximin 40    0    0.0     0   21    52.5        52.5
##                                             1   19    47.5       100.0
##                                                                       
##              Ceftriaxone 40    0    0.0     0   35    87.5        87.5
##                                             1    5    12.5       100.0
##                                                                       
##            Ciprofloxacin 40    0    0.0     0   32    80.0        80.0
##                                             1    8    20.0       100.0
##                                                                       
##               Gentamicin 40    0    0.0     0   40   100.0       100.0
##                                             1    0     0.0       100.0
##                                                                       
##               Tobramicin 40    0    0.0     0   37    92.5        92.5
##                                             1    3     7.5       100.0
##                                                                       
##               Daptomycin 40    0    0.0     0   39    97.5        97.5
##                                             1    1     2.5       100.0
##                                                                       
##                Meropenem 40    0    0.0     0   40   100.0       100.0
##                                             1    0     0.0       100.0
##                                                                       
##          Oral Vancomycin 40    0    0.0     0   39    97.5        97.5
##                                             1    1     2.5       100.0
##                                                                       
## ------------------------------------------------------------ 
## diversity_group: High Diversity
##                      var  n miss p.miss level freq percent cum.percent
##              Basiliximab 27    0    0.0     0   13    48.1        48.1
##                                             1   14    51.9       100.0
##                                                                       
##            Mycophenolate 27    0    0.0     0    7    25.9        25.9
##                                             1   20    74.1       100.0
##                                                                       
##                  Steroid 27    0    0.0     1   27   100.0       100.0
##                                                                       
##      Systemic Vancomycin 27    0    0.0     0   14    51.9        51.9
##                                             1   13    48.1       100.0
##                                                                       
##               Tacrolimus 27    0    0.0     0    0     0.0         0.0
##                                             1   27   100.0       100.0
##                                                                       
##                 Cefepime 27    0    0.0     0   22    81.5        81.5
##                                             1    5    18.5       100.0
##                                                                       
##            Metronidazole 27    0    0.0     0   20    74.1        74.1
##                                             1    7    25.9       100.0
##                                                                       
##  Piperacillin/Tazobactam 27    0    0.0     0    3    11.1        11.1
##                                             1   24    88.9       100.0
##                                                                       
##                Rifaximin 27    0    0.0     0   23    85.2        85.2
##                                             1    4    14.8       100.0
##                                                                       
##              Ceftriaxone 27    0    0.0     0   25    92.6        92.6
##                                             1    2     7.4       100.0
##                                                                       
##            Ciprofloxacin 27    0    0.0     0   25    92.6        92.6
##                                             1    2     7.4       100.0
##                                                                       
##               Gentamicin 27    0    0.0     0   27   100.0       100.0
##                                             1    0     0.0       100.0
##                                                                       
##               Tobramicin 27    0    0.0     0   26    96.3        96.3
##                                             1    1     3.7       100.0
##                                                                       
##               Daptomycin 27    0    0.0     0   27   100.0       100.0
##                                             1    0     0.0       100.0
##                                                                       
##                Meropenem 27    0    0.0     0   26    96.3        96.3
##                                             1    1     3.7       100.0
##                                                                       
##          Oral Vancomycin 27    0    0.0     0   26    96.3        96.3
##                                             1    1     3.7       100.0
##                                                                       
## 
## p-values
##                              pApprox       pExact
## Basiliximab             8.508421e-02 8.535605e-02
## Mycophenolate           6.349040e-01 6.458911e-01
## Steroid                           NA           NA
## Systemic Vancomycin     5.716137e-04 4.324973e-04
## Tacrolimus              4.293852e-01 1.000000e+00
## Cefepime                1.979062e-05 2.581953e-05
## Metronidazole           8.987941e-05 8.806651e-05
## Piperacillin/Tazobactam 2.379309e-01 2.752945e-01
## Rifaximin               8.036339e-06 4.082253e-06
## Ceftriaxone             7.787926e-05 1.113622e-04
## Ciprofloxacin           1.868755e-01 1.767183e-01
## Gentamicin              1.814265e-01 3.369776e-01
## Tobramicin              7.882663e-01 7.925334e-01
## Daptomycin              2.476016e-01 4.468096e-01
## Meropenem               1.019801e-01 1.156897e-01
## Oral Vancomycin         9.477494e-01 1.000000e+00
## 
## Standardize mean differences
##                            average    1 vs 2     1 vs 3     2 vs 3
## Basiliximab             0.36842303 0.3318008 0.55702112 0.21644722
## Mycophenolate           0.13696235 0.0000000 0.20544352 0.20544352
## Steroid                 0.00000000 0.0000000 0.00000000 0.00000000
## Systemic Vancomycin     0.59828195 0.9072768 0.77392588 0.11364321
## Tacrolimus              0.15097027 0.2264554 0.00000000 0.22645541
## Cefepime                0.74607472 0.9422658 1.13838621 0.15757216
## Metronidazole           0.69063384 0.8741734 1.05278172 0.14494641
## Piperacillin/Tazobactam 0.24692356 0.3691067 0.24712083 0.12454313
## Rifaximin               0.95410379 0.5883952 1.51966251 0.75425370
## Ceftriaxone             0.66754595 0.8263939 1.00552172 0.17072227
## Ciprofloxacin           0.32802581 0.1199520 0.49164428 0.37248114
## Gentamicin              0.21629523 0.3244428 0.32444284 0.00000000
## Tobramicin              0.11043478 0.0000000 0.16565217 0.16565217
## Daptomycin              0.28669638 0.2309401 0.40269363 0.22645541
## Meropenem               0.33331297 0.4714045 0.25118429 0.27735010
## Oral Vancomycin         0.04631503 0.0000000 0.06947254 0.06947254
# Print the stratified table and keep the returned object for export;
# `tableone_skewed` is passed as the set of non-normal variables.
abx_div_tab1_2 <- print(
    abx_div_tab1_1,
    nonnormal = tableone_skewed,
    formatOptions = list(big.mark = ",")
)
##                                  Stratified by diversity_group
##                                   Low Diversity Medium Diversity High Diversity
##   n                               40            40               27            
##   Basiliximab = 1 (%)             31 ( 77.5)    25 ( 62.5)       14 ( 51.9)    
##   Mycophenolate = 1 (%)           33 ( 82.5)    33 ( 82.5)       20 ( 74.1)    
##   Steroid = 1 (%)                 40 (100.0)    40 (100.0)       27 (100.0)    
##   Systemic Vancomycin = 1 (%)     33 ( 82.5)    17 ( 42.5)       13 ( 48.1)    
##   Tacrolimus = 1 (%)              40 (100.0)    39 ( 97.5)       27 (100.0)    
##   Cefepime = 1 (%)                27 ( 67.5)    10 ( 25.0)        5 ( 18.5)    
##   Metronidazole = 1 (%)           29 ( 72.5)    13 ( 32.5)        7 ( 25.9)    
##   Piperacillin/Tazobactam = 1 (%) 32 ( 80.0)    37 ( 92.5)       24 ( 88.9)    
##   Rifaximin = 1 (%)               30 ( 75.0)    19 ( 47.5)        4 ( 14.8)    
##   Ceftriaxone = 1 (%)             19 ( 47.5)     5 ( 12.5)        2 (  7.4)    
##   Ciprofloxacin = 1 (%)           10 ( 25.0)     8 ( 20.0)        2 (  7.4)    
##   Gentamicin = 1 (%)               2 (  5.0)     0 (  0.0)        0 (  0.0)    
##   Tobramicin = 1 (%)               3 (  7.5)     3 (  7.5)        1 (  3.7)    
##   Daptomycin = 1 (%)               3 (  7.5)     1 (  2.5)        0 (  0.0)    
##   Meropenem = 1 (%)                4 ( 10.0)     0 (  0.0)        1 (  3.7)    
##   Oral Vancomycin = 1 (%)          1 (  2.5)     1 (  2.5)        1 (  3.7)    
##                                  Stratified by diversity_group
##                                   p      test
##   n                                          
##   Basiliximab = 1 (%)              0.085     
##   Mycophenolate = 1 (%)            0.635     
##   Steroid = 1 (%)                     NA     
##   Systemic Vancomycin = 1 (%)      0.001     
##   Tacrolimus = 1 (%)               0.429     
##   Cefepime = 1 (%)                <0.001     
##   Metronidazole = 1 (%)           <0.001     
##   Piperacillin/Tazobactam = 1 (%)  0.238     
##   Rifaximin = 1 (%)               <0.001     
##   Ceftriaxone = 1 (%)             <0.001     
##   Ciprofloxacin = 1 (%)            0.187     
##   Gentamicin = 1 (%)               0.181     
##   Tobramicin = 1 (%)               0.788     
##   Daptomycin = 1 (%)               0.248     
##   Meropenem = 1 (%)                0.102     
##   Oral Vancomycin = 1 (%)          0.948
# Export the printed table. Saving and then re-reading the same data is an
# easy way to adjust the p-values, because the re-read object is a data frame.
write.csv(
    abx_div_tab1_2,
    file = "./Results/ABX_Table_1_Diversity.csv",
    row.names = TRUE
)

# The p-values still need adjusting and the rows re-ordering, hence the
# series of intermediate data frames that follows.
# Step 1: re-import the exported table and restore the display headers
# (blank label column, group names with spaces).
abx_div_tab1_2_padjust1 <- dplyr::rename(
    read.csv("./Results/ABX_Table_1_Diversity.csv"),
    ` ` = X,
    `Low Diversity` = Low.Diversity,
    `Medium Diversity` = Medium.Diversity,
    `High Diversity` = High.Diversity
)

# Step 2: same table, with the label column stored as a factor whose levels
# follow the current row order.
abx_div_tab1_2_padjust2 <- mutate(
    abx_div_tab1_2_padjust1,
    ` ` = factor(` `, levels = abx_div_tab1_2_padjust1$` `)
)

# Step 3: Benjamini-Hochberg adjustment of the table's p-values, computed
# within each test type, then rows put back in their original order.
#
# The p column comes back from the CSV as text because small values are
# printed as "<0.001". Handing that text to p.adjust() coerces those entries
# to NA, which leaves the most significant rows without an adjusted value and
# shrinks the number of comparisons used for the remaining rows. The p-values
# are therefore parsed to numbers first, taking "<0.001" at its upper bound
# (0.001). That is a conservative stand-in; use the unformatted p-values of
# the table object if exact adjusted values are required.
abx_div_tab1_2_padjust3 <- abx_div_tab1_2_padjust1 %>%
    mutate(
        p_numeric = as.numeric(sub("^<", "", trimws(as.character(p)))),
        # Rows that carry a p-value but no test label are tagged "chi.sq" so
        # that they are adjusted together as one family
        test = ifelse(!is.na(p_numeric) & is.na(test), "chi.sq", test)
    ) %>%
    group_by(test) %>%
    rstatix::adjust_pvalue(p.col = "p_numeric", output.col = "p.adj", method = "BH") %>%
    ungroup() %>%
    select(-p_numeric) %>%
    # Restore the row order recorded in abx_div_tab1_2_padjust2
    mutate(` ` = factor(` `, abx_div_tab1_2_padjust2$` `)) %>%
    arrange(` `) %>%
    # Blank out missing values so the exported CSV shows empty cells
    mutate(p = ifelse(is.na(p), "", p), p.adj = ifelse(is.na(p.adj),
        "", p.adj))

# Write out the table with the adjusted p-values appended
write.csv(
    abx_div_tab1_2_padjust3,
    file = "./Results/ABX_Table_1_Diversity_padjust.csv",
    row.names = FALSE
)

CARD Resistance Genes: Diversity Groups

# CARD abundance rows joined to the CARD dictionary (keys = shared columns)
div_card_tot <- left_join(card2, card_dict)

# Heatmap column order: samples in order of increasing diversity.
# Built from heatmap_data (the qualitative metabolomic data, per the original
# note): first row of every patient whose ID starts with "lt", joined to the
# Shannon table and sorted by db, then Shannon.
card_heatmap_column_order <- heatmap_data %>%
  ungroup() %>%
  filter(grepl("^lt", patientID)) %>%
  group_by(patientID) %>%
  dplyr::slice(1) %>%
  ungroup() %>%
  left_join(alpha_shannon) %>%
  # arrange() ignores grouping, so no group_by(db) is needed for the sort
  arrange(db, Shannon) %>%
  pull(sampleID) %>%
  unique()

# AMRGeneFamily level
# Long table of per-sample RPKM for each AMR gene family whose DrugClass
# mentions glycopeptide or oxazolidinone, limited to the heatmap samples and
# annotated with the sample's diversity group. The result is left grouped by
# AMRGeneFamily.
div_card_family_tot_0 <-
  div_card_tot %>%
  filter(sampleID %in% card_heatmap_column_order) %>% 
  filter(grepl(DrugClass,  pattern = "glycopeptide|oxazolidinone")#,
         # !grepl(ResistanceMechanism, pattern = "target")
         ) %>%
  group_by(sampleID, AMRGeneFamily) %>%
  mutate(rpkm_mean = mean(rpkm)) %>%
  # rpkm_mean is constant within a sample/family group, so with ties disabled
  # this keeps a single row per group.
  # NOTE(review): the value carried forward below is that row's rpkm, not
  # rpkm_mean -- confirm this is intended.
  dplyr::slice_max(rpkm_mean, n = 1, with_ties = F) %>%
  ungroup() %>%
  # Widen to one column per family so absent sample/family pairs become 0
  pivot_wider(id_cols = c(sampleID, rpkm_mean), names_from = "AMRGeneFamily", values_from = "rpkm", values_fill = 0) %>%
  mutate_if(is.numeric, ~replace(., is.na(.), 0)) %>%
  # Drop columns that are zero in every row
  select(where(~ any(. != 0))) %>%
  pivot_longer(-c(sampleID, rpkm_mean), names_to = "AMRGeneFamily", values_to = "rpkm") %>%
  group_by(sampleID, AMRGeneFamily) %>%
  # rpkm_mean was an id column, so a sample can span several wide rows;
  # keep the largest rpkm per sample/family
  slice_max(rpkm, n = 1, with_ties = F) %>%
  ungroup() %>%
  left_join(metaphlan_df_sumry %>% select(sampleID, diversity_group)) %>%
  drop_na(diversity_group, AMRGeneFamily) %>%
  group_by(AMRGeneFamily) %>%
  # Keep only families with at least one non-zero value
  filter(any(rpkm != 0)) # %>% ungroup() %>% distinct(sampleID)
  # group_by(AMRGeneFamily, diversity_group) %>% 
  # filter(n_distinct(rpkm) > 1) # Only keep samples with more than 2 unique values (helps remove the instance where all observations are 0 except 1)

# Test for differences between diversity groups: Kruskal-Wallis on rpkm,
# run separately for each AMR gene family, followed by Benjamini-Hochberg
# adjustment of the p-values. No significance filter is applied here.
div_card_family_tot_0_stats <- div_card_family_tot_0 %>%
  ungroup() %>%
  drop_na() %>%
  mutate(
    diversity_group = factor(
      diversity_group,
      levels = c("Low Diversity", "Medium Diversity", "High Diversity")
    )
  ) %>%
  group_by(AMRGeneFamily) %>%
  rstatix::kruskal_test(rpkm ~ diversity_group) %>%
  rstatix::adjust_pvalue(method = "BH")

# Sample x AMR-gene-family table of binned RPKM labels shown in the heatmap
# (row names = sampleID, one column per family present in the stats table).
div_card_family_tot_0_mat <-
  card_heatmap_column_order %>%
  as.data.frame() %>%
  dplyr::rename('sampleID' = ".") %>%
  left_join(div_card_tot) %>%
  # NOTE(review): every row originates from card_heatmap_column_order, so the
  # right-hand side of `|` is always TRUE and this filter keeps all rows; the
  # family selection actually happens in the right_join() with the stats below.
  filter(grepl(DrugClass,  pattern = "glycopeptide|oxazolidinone") | sampleID %in% card_heatmap_column_order
         ) %>%
  group_by(sampleID, AMRGeneFamily) %>%
  mutate(rpkm_mean = mean(rpkm)) %>%
  # rpkm_mean is constant within a group: keeps one row per sample/family
  dplyr::slice_max(rpkm_mean, n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  # Bin RPKM into display categories. Missing values get "0" explicitly:
  # an NA fails every comparison and would otherwise fall through to "100+".
  mutate(rpkm = case_when(is.na(rpkm) ~ "0",
                          rpkm == 0 ~ "0",
                          between(rpkm,  0, 10) ~ "0 - 10",
                          between(rpkm, 10, 50) ~ "10 - 50",
                          between(rpkm, 50, 100) ~ "50 - 100",
                          TRUE ~ "100+"
         )) %>%
  left_join(metaphlan_df_sumry %>% select(sampleID, diversity_group)) %>%
  pivot_wider(id_cols = c(sampleID, diversity_group), names_from = "AMRGeneFamily", values_from = "rpkm") %>%
  # Sample/family pairs with no row are shown as "0"
  mutate(across(where(is.character), ~ replace(.x, is.na(.x), "0"))) %>%
  pivot_longer(!c(sampleID, diversity_group), names_to = "AMRGeneFamily", values_to = "rpkm") %>%
  group_by(sampleID, AMRGeneFamily) %>%
  # Collapse to a single row per sample/family
  dplyr::slice_max(diversity_group, n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  # Restrict to the families that were tested
  right_join(div_card_family_tot_0_stats %>% select(AMRGeneFamily)) %>%
  pivot_wider(id_cols = sampleID, names_from = "AMRGeneFamily", values_from = "rpkm") %>%
  column_to_rownames(var = "sampleID")


# Create labels
# Diversity group for each sample (row) of the heatmap table, as a factor
# with levels in the order Low, Medium, High. Used to split heatmap columns.
div_card_family_heatmap_labels <- div_card_family_tot_0_mat %>%
  rownames_to_column(var = "sampleID") %>%
  left_join(select(metaphlan_df_sumry, sampleID, diversity_group)) %>%
  group_by(sampleID) %>%
  dplyr::slice_max(diversity_group, n = 1, with_ties = FALSE) %>%
  right_join(rownames_to_column(div_card_family_tot_0_mat, var = "sampleID")) %>%
  select(diversity_group) %>%
  mutate(
    diversity_group = factor(
      diversity_group,
      levels = c("Low Diversity", "Medium Diversity", "High Diversity")
    )
  ) %>%
  pull(diversity_group)

# Create groupings of CARD classes
# One row per AMR gene family in the heatmap table, with a simplified
# drug-class label (DrugClass2) plus the test results, sorted by adjusted
# p-value. DrugClass2 is used as the heatmap's row_split, so the row order
# here has to match the p.adj-sorted row order of the heatmap matrix.
div_card_family_heatmap_groups <-
  div_card_family_tot_0_mat %>%
  t() %>%
  as.data.frame() %>%
  rownames_to_column(var = "AMRGeneFamily") %>%
  left_join(div_card_family_tot_0_stats %>% select(AMRGeneFamily, p.adj)) %>%
  arrange(p.adj) %>%
  select(-p.adj) %>%
  # If card_dict lists a family more than once this join repeats the family;
  # slice(1) below keeps only its first DrugClass
  left_join(card_dict %>% select(AMRGeneFamily, DrugClass)) %>%
  select(AMRGeneFamily, DrugClass) %>%
  # First matching branch wins; anything unmatched becomes "Multi-Drug"
  mutate(DrugClass2 = case_when(grepl(DrugClass, pattern = "carbapenem|penam") ~ "Carbapenem",
                                DrugClass == "glycopeptide antibiotic" ~ "Vancomycin",
                                grepl(DrugClass, pattern = "oxazolidinone antibiotic") ~ "Linezolid",
                                grepl(DrugClass, pattern = "fluoroquinolone") ~ "Fluoroquinolone",
                                TRUE ~ "Multi-Drug"
                                )) %>%
  group_by(AMRGeneFamily) %>%
  dplyr::slice(1) %>%
  left_join(div_card_family_tot_0_stats) %>%
  arrange(p.adj)

# Create row annotation of CARD classes
# One-column data frame (row names = AMRGeneFamily, column = Significance)
# giving the significance band of each family's BH-adjusted p-value, sorted
# by adjusted p-value. Used for the heatmap's right-hand row annotation.
div_card_family_heatmap_signif <-
  div_card_family_tot_0_mat %>%
  t() %>%
  as.data.frame() %>%
  rownames_to_column(var = "AMRGeneFamily") %>%
  left_join(div_card_family_tot_0_stats) %>%
  arrange(p.adj) %>%
  # case_when() returns the first branch that matches, so the cut-offs must
  # run from strictest to loosest; with `<= 0.05` listed first the two
  # stricter bands could never be assigned.
  # NOTE(review): the strictest label reads "p < 0.0001" although its cut-off
  # is 0.001. The text is kept because it is the key used in the annotation
  # colour vector (pcol); change both together if the label is corrected.
  mutate(p.adj = case_when(p.adj <= 0.001 ~ "p < 0.0001",
                           p.adj <= 0.01 ~ "p < 0.01",
                           p.adj <= 0.05 ~ "p < 0.05",
                           p.adj > 0.05 ~ "NS"),
         p.adj = as.factor(p.adj)) %>%
  select(AMRGeneFamily, Significance = p.adj) %>%
  # Keep only families that appear in the CARD dictionary
  inner_join(card_dict %>%
              select(AMRGeneFamily) %>%
              group_by(AMRGeneFamily) %>%
              dplyr::slice(1)) %>%
  column_to_rownames(var = "AMRGeneFamily")

# Annotation colours keyed by significance label (named character vector)
pcol <- c(
  "NS" = "gray80",
  "p < 0.05" = "#98e2fa",
  "p < 0.01" = "#44cffc",
  "p < 0.0001" = "#03befc"
)


# Heatmap of binned RPKM: rows = AMR gene families (sorted by adjusted
# p-value), columns = samples split by diversity group.
gg_div_card_family_heatmap <-
div_card_family_tot_0_mat %>%
  # Transpose so families become rows, then sort the rows by adjusted p-value
  t() %>%
  as.data.frame() %>%
  rownames_to_column(var = "AMRGeneFamily") %>%
  left_join(div_card_family_tot_0_stats %>% select(AMRGeneFamily, p.adj)) %>%
  arrange(p.adj) %>%
  select(-p.adj) %>%
  column_to_rownames(var = "AMRGeneFamily") %>% #nrow()
  as.matrix() %>% 
  Heatmap(
    name = "RPKM",
    # Discrete colour per RPKM bin (the matrix holds the bin labels as text)
    col = c("0" = "#f0f4f5",
            "0 - 10" = "#edb4de",
            "10 - 50" = "#db8cc6",
            "50 - 100" = "#c96bb0",
            "100+" = "#9c3380"),
    rect_gp = gpar(col = "black", lwd = 0.2),
    column_names_gp = grid::gpar(fontsize = 8),
    column_gap = unit(6, "mm"),
    # Columns: split by diversity group, fixed order, no clustering
    column_split = div_card_family_heatmap_labels,
    column_order = card_heatmap_column_order,
    column_title_gp = gpar(fontsize = 14),
    column_title_rot = 0,
    cluster_columns = FALSE,
    show_column_names = FALSE,
    show_column_dend = FALSE,
    row_names_gp = gpar(fontsize = 6),
    row_gap = unit(3.5, "mm"),
    row_names_side = c("left"),
    # Rows: fixed order from the significance table, split by drug class
    row_order = rownames(div_card_family_heatmap_signif),
    row_split = div_card_family_heatmap_groups$DrugClass2,
    row_title_rot = 0,
    cluster_rows = FALSE,
    show_row_dend = FALSE,
    # Significance band of each family, coloured via pcol
    right_annotation = rowAnnotation(df = div_card_family_heatmap_signif,
                                     show_annotation_name = F,
                                     col = list(Significance = pcol)
                                     )
    # row_dend_width = unit(4, "in"),
    # heatmap_width =  unit(12, "in")
  )
# Draw the heatmap on the current device
gg_div_card_family_heatmap

# Save the diversity CARD heatmap as a 12 x 4 inch PDF
pdf(
  file = "./Results/Diversity_CARD.pdf",
  width = 12,
  height = 4
)
gg_div_card_family_heatmap
dev.off()
## quartz_off_screen 
##                 2

CARD Resistance Genes: Enterococcus Expansion

# CARD abundance rows joined to the CARD dictionary (keys = shared columns)
ecoc_card_tot <- left_join(card2, card_dict)

# Heatmap column order: unique sample IDs from peri_matrix_all, sorted by
# increasing Enterococcus relative abundance
ecoc_card_heatmap_column_order <- peri_matrix_all %>%
  ungroup() %>%
  arrange(enterococcus_rel_abundance) %>%
  pull(sampleID) %>%
  unique()

# AMRGeneFamily level
# Long table of per-sample RPKM for each AMR gene family whose DrugClass
# mentions glycopeptide, limited to the heatmap samples and labelled with the
# sample's Enterococcus expansion status. The result is left grouped by
# AMRGeneFamily.
ecoc_card_family_tot_0 <-
  ecoc_card_tot %>%
  filter(sampleID %in% ecoc_card_heatmap_column_order,
         grepl(DrugClass,  pattern = "glycopeptide")) %>%
  group_by(sampleID, AMRGeneFamily) %>%
  mutate(rpkm_mean = mean(rpkm)) %>%
  # rpkm_mean is constant within a sample/family group, so with ties disabled
  # this keeps a single row per group.
  # NOTE(review): the value carried forward below is that row's rpkm, not
  # rpkm_mean -- confirm this is intended.
  dplyr::slice_max(rpkm_mean, n = 1, with_ties = F) %>%
  ungroup() %>%
  # Widen to one column per family so absent sample/family pairs become 0
  pivot_wider(id_cols = c(sampleID, rpkm_mean), names_from = "AMRGeneFamily", values_from = "rpkm", values_fill = 0) %>%
  mutate_if(is.numeric, ~replace(., is.na(.), 0)) %>%
  # Drop columns that are zero in every row
  select(where(~ any(. != 0))) %>%
  pivot_longer(-c(sampleID, rpkm_mean), names_to = "AMRGeneFamily", values_to = "rpkm") %>%
  group_by(sampleID, AMRGeneFamily) %>%
  # Keep the largest rpkm per sample/family
  slice_max(rpkm, n = 1, with_ties = F) %>%
  ungroup() %>%
  # "Expansion" = Enterococcus relative abundance at or above the second
  # entry of optimal_cutpoint_rel$optimal_cutpoint.
  # NOTE(review): presumably index 2 is the Enterococcus cutpoint -- confirm.
  right_join(peri_matrix_all %>% 
             mutate(domination = case_when(enterococcus_rel_abundance >= optimal_cutpoint_rel$optimal_cutpoint[2] #|
                                             # enterobacterales_rel_abundance >= optimal_cutpoint_rel$optimal_cutpoint[1]
                                           ~ "Expansion",
                                          TRUE ~ "No Expansion")) %>% 
               select(sampleID, domination)) %>%
  drop_na(domination, AMRGeneFamily) %>%
  group_by(AMRGeneFamily) %>%
  # Keep only families with at least one non-zero value
  filter(any(rpkm != 0)) # %>% ungroup() %>% distinct(sampleID)
  # group_by(AMRGeneFamily, domination) %>% 
  # filter(n_distinct(rpkm) > 1) # Only keep samples with more than 2 unique values (helps remove the instance where all observations are 0 except 1)

# Test for differences between expansion groups: Wilcoxon test on rpkm
# (No Expansion vs Expansion), run separately for each AMR gene family,
# followed by Benjamini-Hochberg adjustment. No significance filter is
# applied here.
ecoc_card_family_tot_0_stats <- ecoc_card_family_tot_0 %>%
  ungroup() %>%
  drop_na() %>%
  mutate(
    domination = factor(domination, levels = c("No Expansion", "Expansion"))
  ) %>%
  group_by(AMRGeneFamily) %>%
  rstatix::wilcox_test(rpkm ~ domination) %>%
  rstatix::adjust_pvalue(method = "BH")

# Sample x AMR-gene-family table of binned RPKM labels shown in the heatmap
# (row names = sampleID, one column per family present in the stats table).
ecoc_card_family_tot_0_mat <-
  ecoc_card_heatmap_column_order %>%
  as.data.frame() %>%
  dplyr::rename('sampleID' = ".") %>%
  left_join(ecoc_card_tot) %>%
  # NOTE(review): every row originates from ecoc_card_heatmap_column_order, so
  # the right-hand side of `|` is always TRUE and this filter keeps all rows;
  # the family selection actually happens in the right_join() with the stats.
  filter(grepl(DrugClass,  pattern = "glycopeptide|oxazolidinone") | sampleID %in% ecoc_card_heatmap_column_order
         ) %>%
  group_by(sampleID, AMRGeneFamily) %>%
  mutate(rpkm_mean = mean(rpkm)) %>%
  # rpkm_mean is constant within a group: keeps one row per sample/family
  dplyr::slice_max(rpkm_mean, n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  # Bin RPKM into display categories. Missing values get "0" explicitly:
  # an NA fails every comparison and would otherwise fall through to "100+".
  mutate(rpkm = case_when(is.na(rpkm) ~ "0",
                          rpkm == 0 ~ "0",
                          between(rpkm,  0, 10) ~ "0 - 10",
                          between(rpkm, 10, 50) ~ "10 - 50",
                          between(rpkm, 50, 100) ~ "50 - 100",
                          TRUE ~ "100+"
         )) %>%
  # Expansion status from the Enterococcus relative-abundance cutpoint
  left_join(peri_matrix_all %>%
              mutate(domination = case_when(enterococcus_rel_abundance >= optimal_cutpoint_rel$optimal_cutpoint[2]
                                            ~ "Expansion",
                                            TRUE ~ "No Expansion")) %>%
              select(sampleID, domination)) %>%
  pivot_wider(id_cols = c(sampleID, domination), names_from = "AMRGeneFamily", values_from = "rpkm") %>%
  # Sample/family pairs with no row are shown as "0"
  mutate(across(where(is.character), ~ replace(.x, is.na(.x), "0"))) %>%
  pivot_longer(!c(sampleID, domination), names_to = "AMRGeneFamily", values_to = "rpkm") %>%
  group_by(sampleID, AMRGeneFamily) %>%
  # Collapse to a single row per sample/family
  dplyr::slice_max(domination, n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  # Restrict to the families that were tested
  right_join(ecoc_card_family_tot_0_stats %>% select(AMRGeneFamily)) %>%
  pivot_wider(id_cols = sampleID, names_from = "AMRGeneFamily", values_from = "rpkm") %>%
  column_to_rownames(var = "sampleID")


# Create labels
# Enterococcus expansion status for each sample (row) of the heatmap table,
# as a factor with levels "No Expansion", "Expansion". Used to split the
# heatmap columns.
ecoc_card_family_heatmap_labels <- ecoc_card_family_tot_0_mat %>%
  rownames_to_column(var = "sampleID") %>%
  left_join(
    peri_matrix_all %>%
      mutate(
        domination = case_when(
          enterococcus_rel_abundance >= optimal_cutpoint_rel$optimal_cutpoint[2] ~ "Expansion",
          TRUE ~ "No Expansion"
        )
      ) %>%
      select(sampleID, domination)
  ) %>%
  group_by(sampleID) %>%
  dplyr::slice_max(domination, n = 1, with_ties = FALSE) %>%
  right_join(rownames_to_column(ecoc_card_family_tot_0_mat, var = "sampleID")) %>%
  select(domination) %>%
  mutate(
    domination = factor(domination, levels = c("No Expansion", "Expansion"))
  ) %>%
  pull(domination)

# Create groupings of CARD classes
# One row per AMR gene family in the heatmap table, with a simplified
# drug-class label (DrugClass2) plus the test results, sorted by adjusted
# p-value. DrugClass2 is used as the heatmap's row_split, so the row order
# here has to match the p.adj-sorted row order of the heatmap matrix.
ecoc_card_family_heatmap_groups <-
  ecoc_card_family_tot_0_mat %>%
  t() %>%
  as.data.frame() %>%
  rownames_to_column(var = "AMRGeneFamily") %>%
  left_join(ecoc_card_family_tot_0_stats %>% select(AMRGeneFamily, p.adj)) %>%
  arrange(p.adj) %>%
  select(-p.adj) %>%
  # If card_dict lists a family more than once this join repeats the family;
  # slice(1) below keeps only its first DrugClass
  left_join(card_dict %>% select(AMRGeneFamily, DrugClass)) %>%
  select(AMRGeneFamily, DrugClass) %>%
  # First matching branch wins; anything unmatched becomes "Multi-Drug"
  mutate(DrugClass2 = case_when(grepl(DrugClass, pattern = "carbapenem|penam") ~ "Carbapenem",
                                DrugClass == "glycopeptide antibiotic" ~ "Vancomycin",
                                grepl(DrugClass, pattern = "oxazolidinone antibiotic") ~ "Linezolid",
                                grepl(DrugClass, pattern = "fluoroquinolone") ~ "Fluoroquinolone",
                                TRUE ~ "Multi-Drug"
                                )) %>%
  group_by(AMRGeneFamily) %>%
  dplyr::slice(1) %>%
  left_join(ecoc_card_family_tot_0_stats) %>%
  arrange(p.adj)

# Create row annotation of CARD classes
# One-column data frame (row names = AMRGeneFamily, column = Significance)
# giving the significance band of each family's BH-adjusted p-value, sorted
# by adjusted p-value. Used for the heatmap's right-hand row annotation.
ecoc_card_family_heatmap_signif <-
  ecoc_card_family_tot_0_mat %>%
  t() %>%
  as.data.frame() %>%
  rownames_to_column(var = "AMRGeneFamily") %>%
  left_join(ecoc_card_family_tot_0_stats) %>%
  arrange(p.adj) %>%
  # case_when() returns the first branch that matches, so the cut-offs must
  # run from strictest to loosest; with `<= 0.05` listed first the two
  # stricter bands could never be assigned.
  # NOTE(review): the strictest label reads "p < 0.0001" although its cut-off
  # is 0.001. The text is kept because it is the key used in the annotation
  # colour vector (pcol); change both together if the label is corrected.
  mutate(p.adj = case_when(p.adj <= 0.001 ~ "p < 0.0001",
                           p.adj <= 0.01 ~ "p < 0.01",
                           p.adj <= 0.05 ~ "p < 0.05",
                           p.adj > 0.05 ~ "NS"),
         p.adj = as.factor(p.adj)) %>%
  select(AMRGeneFamily, Significance = p.adj) %>%
  # Keep only families that appear in the CARD dictionary
  inner_join(card_dict %>%
              select(AMRGeneFamily) %>%
              group_by(AMRGeneFamily) %>%
              dplyr::slice(1)) %>%
  column_to_rownames(var = "AMRGeneFamily")

# Heatmap of binned RPKM: rows = AMR gene families (sorted by adjusted
# p-value), columns = samples split by Enterococcus expansion status.
gg_ecoc_card_family_heatmap <-
ecoc_card_family_tot_0_mat %>%
  # Transpose so families become rows, then sort the rows by adjusted p-value
  t() %>%
  as.data.frame() %>%
  rownames_to_column(var = "AMRGeneFamily") %>%
  left_join(ecoc_card_family_tot_0_stats %>% select(AMRGeneFamily, p.adj)) %>%
  arrange(p.adj) %>%
  select(-p.adj) %>%
  column_to_rownames(var = "AMRGeneFamily") %>% #nrow()
  as.matrix() %>% 
  Heatmap(
    name = "RPKM",
    # Discrete colour per RPKM bin (the matrix holds the bin labels as text)
    col = c("0" = "#f0f4f5",
            "0 - 10" = "#edb4de",
            "10 - 50" = "#db8cc6",
            "50 - 100" = "#c96bb0",
            "100+" = "#9c3380"),
    rect_gp = gpar(col = "black", lwd = 0.2),
    column_names_gp = grid::gpar(fontsize = 8),
    column_gap = unit(6, "mm"),
    # Columns: split by expansion status, fixed order, no clustering
    column_split = ecoc_card_family_heatmap_labels,
    column_order = ecoc_card_heatmap_column_order,
    column_title_gp = gpar(fontsize = 14),
    column_title_rot = 0,
    cluster_columns = FALSE,
    show_column_names = FALSE,
    show_column_dend = FALSE,
    row_names_gp = gpar(fontsize = 6),
    row_gap = unit(3.5, "mm"),
    row_names_side = c("left"),
    # Rows: fixed order from the significance table, split by drug class
    row_order = rownames(ecoc_card_family_heatmap_signif),
    row_split = ecoc_card_family_heatmap_groups$DrugClass2,
    row_title_rot = 0,
    cluster_rows = FALSE,
    show_row_dend = FALSE,
    # Significance band of each family, coloured via pcol
    right_annotation = rowAnnotation(df = ecoc_card_family_heatmap_signif,
                                     show_annotation_name = F,
                                     col = list(Significance = pcol)
                                     )
    # row_dend_width = unit(4, "in"),
    # heatmap_width =  unit(12, "in")
  )
# Draw the heatmap on the current device
gg_ecoc_card_family_heatmap

# Save the Enterococcus-expansion CARD heatmap as a 12 x 4 inch PDF
pdf(
  file = "./Results/Ecoc_Expan_CARD.pdf",
  width = 12,
  height = 4
)
gg_ecoc_card_family_heatmap
dev.off()
## quartz_off_screen 
##                 2

CARD Resistance Genes: Enterobacterales Expansion

# CARD abundance rows joined to the CARD dictionary (keys = shared columns)
ebac_card_tot <- left_join(card2, card_dict)

# Heatmap column order: unique sample IDs from peri_matrix_all, sorted by
# increasing Enterobacterales relative abundance
ebac_card_heatmap_column_order <- peri_matrix_all %>%
  ungroup() %>%
  arrange(enterobacterales_rel_abundance) %>%
  pull(sampleID) %>%
  unique()

# AMRGeneFamily level
# Long table of per-sample RPKM for each AMR gene family whose DrugClass
# mentions carbapenem or fluoroquinolone (and not oxazolidinone), limited to
# the heatmap samples and labelled with the sample's Enterobacterales
# expansion status. The result is left grouped by AMRGeneFamily.
ebac_card_family_tot_0 <-
  ebac_card_tot %>%
  filter(sampleID %in% ebac_card_heatmap_column_order,
         grepl(DrugClass,  pattern = "carbapenem|fluoroquinolone"),
         !grepl(DrugClass, pattern = "oxazolidinone")) %>% 
  group_by(sampleID, AMRGeneFamily) %>%
  mutate(rpkm_mean = mean(rpkm)) %>%
  # rpkm_mean is constant within a sample/family group, so with ties disabled
  # this keeps a single row per group.
  # NOTE(review): the value carried forward below is that row's rpkm, not
  # rpkm_mean -- confirm this is intended.
  dplyr::slice_max(rpkm_mean, n = 1, with_ties = F) %>%
  ungroup() %>%
  # Widen to one column per family so absent sample/family pairs become 0
  pivot_wider(id_cols = c(sampleID, rpkm_mean), names_from = "AMRGeneFamily", values_from = "rpkm", values_fill = 0) %>%
  mutate_if(is.numeric, ~replace(., is.na(.), 0)) %>%
  # Drop columns that are zero in every row
  select(where(~ any(. != 0))) %>%
  pivot_longer(-c(sampleID, rpkm_mean), names_to = "AMRGeneFamily", values_to = "rpkm") %>%
  group_by(sampleID, AMRGeneFamily) %>%
  # Keep the largest rpkm per sample/family
  slice_max(rpkm, n = 1, with_ties = F) %>%
  ungroup() %>%
  # "Expansion" = Enterobacterales relative abundance at or above the first
  # entry of optimal_cutpoint_rel$optimal_cutpoint.
  # NOTE(review): presumably index 1 is the Enterobacterales cutpoint -- confirm.
  right_join(peri_matrix_all %>% 
             mutate(domination = case_when(enterobacterales_rel_abundance >= optimal_cutpoint_rel$optimal_cutpoint[1]
                                           ~ "Expansion",
                                          TRUE ~ "No Expansion")) %>% 
               select(sampleID, domination)) %>%
  drop_na(domination, AMRGeneFamily) %>%
  group_by(AMRGeneFamily) %>%
  # Keep only families with at least one non-zero value
  filter(any(rpkm != 0)) # %>% ungroup() %>% distinct(sampleID)
  # group_by(AMRGeneFamily, domination) %>% 
  # filter(n_distinct(rpkm) > 1) # Only keep samples with more than 2 unique values (helps remove the instance where all observations are 0 except 1)

# Test for differences between expansion groups: Wilcoxon test on rpkm
# (No Expansion vs Expansion), run separately for each AMR gene family,
# followed by Benjamini-Hochberg adjustment. No significance filter is
# applied here.
ebac_card_family_tot_0_stats <- ebac_card_family_tot_0 %>%
  ungroup() %>%
  drop_na() %>%
  mutate(
    domination = factor(domination, levels = c("No Expansion", "Expansion"))
  ) %>%
  group_by(AMRGeneFamily) %>%
  rstatix::wilcox_test(rpkm ~ domination) %>%
  rstatix::adjust_pvalue(method = "BH")

# Sample x AMR-gene-family table of binned RPKM labels shown in the heatmap
# (row names = sampleID, one column per family present in the stats table).
ebac_card_family_tot_0_mat <-
  ebac_card_heatmap_column_order %>%
  as.data.frame() %>%
  dplyr::rename('sampleID' = ".") %>%
  left_join(ebac_card_tot) %>%
  # NOTE(review): every row originates from ebac_card_heatmap_column_order, so
  # the right-hand side of `|` is always TRUE and the first condition keeps
  # all rows; only the oxazolidinone exclusion filters anything here. The
  # family selection itself happens in the right_join() with the stats below.
  filter(grepl(DrugClass,  pattern = "glycopeptide") | sampleID %in% ebac_card_heatmap_column_order,
         !grepl(DrugClass, pattern = "oxazolidinone")) %>%
  group_by(sampleID, AMRGeneFamily) %>%
  mutate(rpkm_mean = mean(rpkm)) %>%
  # rpkm_mean is constant within a group: keeps one row per sample/family
  dplyr::slice_max(rpkm_mean, n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  # Bin RPKM into display categories. Missing values get "0" explicitly:
  # an NA fails every comparison and would otherwise fall through to "100+".
  mutate(rpkm = case_when(is.na(rpkm) ~ "0",
                          rpkm == 0 ~ "0",
                          between(rpkm,  0, 10) ~ "0 - 10",
                          between(rpkm, 10, 50) ~ "10 - 50",
                          between(rpkm, 50, 100) ~ "50 - 100",
                          TRUE ~ "100+"
         )) %>%
  # Expansion status from the Enterobacterales relative-abundance cutpoint
  left_join(peri_matrix_all %>%
              mutate(domination = case_when(enterobacterales_rel_abundance >= optimal_cutpoint_rel$optimal_cutpoint[1]
                                            ~ "Expansion",
                                            TRUE ~ "No Expansion")) %>%
              select(sampleID, domination)) %>%
  pivot_wider(id_cols = c(sampleID, domination), names_from = "AMRGeneFamily", values_from = "rpkm") %>%
  # Sample/family pairs with no row are shown as "0"
  mutate(across(where(is.character), ~ replace(.x, is.na(.x), "0"))) %>%
  pivot_longer(!c(sampleID, domination), names_to = "AMRGeneFamily", values_to = "rpkm") %>%
  group_by(sampleID, AMRGeneFamily) %>%
  # Collapse to a single row per sample/family
  dplyr::slice_max(domination, n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  # Restrict to the families that were tested
  right_join(ebac_card_family_tot_0_stats %>% select(AMRGeneFamily)) %>%
  pivot_wider(id_cols = sampleID, names_from = "AMRGeneFamily", values_from = "rpkm") %>%
  column_to_rownames(var = "sampleID")


# Create labels
# Enterobacterales expansion status for each sample (row) of the heatmap
# table, as a factor with levels "No Expansion", "Expansion".
ebac_card_family_heatmap_labels <- ebac_card_family_tot_0_mat %>%
  rownames_to_column(var = "sampleID") %>%
  left_join(
    peri_matrix_all %>%
      mutate(
        domination = case_when(
          enterobacterales_rel_abundance >= optimal_cutpoint_rel$optimal_cutpoint[1] ~ "Expansion",
          TRUE ~ "No Expansion"
        )
      ) %>%
      select(sampleID, domination)
  ) %>%
  group_by(sampleID) %>%
  dplyr::slice_max(domination, n = 1, with_ties = FALSE) %>%
  right_join(rownames_to_column(ebac_card_family_tot_0_mat, var = "sampleID")) %>%
  select(domination) %>%
  mutate(
    domination = factor(domination, levels = c("No Expansion", "Expansion"))
  ) %>%
  pull(domination)

# Row-split table for the heatmap: one row per AMR gene family with its CARD
# DrugClass, a collapsed DrugClass2 label, and the columns of
# ebac_card_family_tot_0_stats, sorted by adjusted p-value.
# The result is still grouped by AMRGeneFamily.
ebac_card_family_heatmap_groups <-
  ebac_card_family_tot_0_mat %>%
  t() %>%
  as.data.frame() %>%
  rownames_to_column("AMRGeneFamily") %>%
  left_join(select(ebac_card_family_tot_0_stats, AMRGeneFamily, p.adj)) %>%
  arrange(p.adj) %>%
  select(-p.adj) %>%
  left_join(select(card_dict, AMRGeneFamily, DrugClass)) %>%
  select(AMRGeneFamily, DrugClass) %>%
  mutate(
    # First matching rule wins; anything unmatched is "Multi-Drug".
    DrugClass2 = case_when(
      grepl("carbapenem|penam", DrugClass) ~ "Carbapenem",
      DrugClass == "glycopeptide antibiotic" ~ "Vancomycin",
      grepl("fluoroquinolone", DrugClass) ~ "Fluoroquinolone",
      TRUE ~ "Multi-Drug"
    )
  ) %>%
  # card_dict can list a family more than once; keep its first row only.
  group_by(AMRGeneFamily) %>%
  dplyr::slice(1) %>%
  left_join(ebac_card_family_tot_0_stats) %>%
  arrange(p.adj)

# Create row annotation of CARD classes: a one-column data frame
# (`Significance`) whose row names are the AMR gene families, ordered by
# adjusted p-value. Annotation colours are supplied separately where the
# heatmap is drawn, so no colour column is built here.
ebac_card_family_heatmap_signif <-
  ebac_card_family_tot_0_mat %>%
  t() %>%
  as.data.frame() %>%
  rownames_to_column(var = "AMRGeneFamily") %>%
  left_join(ebac_card_family_tot_0_stats) %>%
  arrange(p.adj) %>%
  mutate(
    # case_when() returns the first branch that matches, so the strictest
    # cutoff has to be tested first; with `p.adj <= 0.05` listed ahead of the
    # others, the 0.01 and 0.001 tiers could never be assigned.
    # NOTE(review): the 0.001 cutoff is paired with the label "p < 0.0001".
    # The label text is kept unchanged because it may be used as a key in the
    # annotation colour vector -- confirm which of the two is intended.
    p.adj = case_when(p.adj <= 0.001 ~ "p < 0.0001",
                      p.adj <= 0.01 ~ "p < 0.01",
                      p.adj <= 0.05 ~ "p < 0.05",
                      p.adj > 0.05 ~ "NS"),
    p.adj = as.factor(p.adj)
  ) %>%
  select(AMRGeneFamily, Significance = p.adj) %>%
  # Keep only families present in card_dict (one row per family).
  inner_join(card_dict %>%
               select(AMRGeneFamily) %>%
               group_by(AMRGeneFamily) %>%
               dplyr::slice(1)) %>%
  column_to_rownames(var = "AMRGeneFamily")

# Heatmap of binned RPKM categories: AMR gene families in rows, samples in
# columns. Rows use the order of ebac_card_family_heatmap_signif and are split
# by DrugClass2; columns are split by expansion label. No clustering is done
# on either axis.
gg_ebac_card_family_heatmap <-
  ebac_card_family_tot_0_mat %>%
  t() %>%
  as.data.frame() %>%
  rownames_to_column("AMRGeneFamily") %>%
  left_join(select(ebac_card_family_tot_0_stats, AMRGeneFamily, p.adj)) %>%
  arrange(p.adj) %>%
  select(-p.adj) %>%
  column_to_rownames("AMRGeneFamily") %>%
  as.matrix() %>%
  Heatmap(
    # Cell appearance
    name = "RPKM",
    col = c(
      "0" = "#f0f4f5",
      "0 - 10" = "#edb4de",
      "10 - 50" = "#db8cc6",
      "50 - 100" = "#c96bb0",
      "100+" = "#9c3380"
    ),
    rect_gp = gpar(col = "black", lwd = 0.2),
    # Columns (samples)
    cluster_columns = FALSE,
    show_column_dend = FALSE,
    show_column_names = FALSE,
    column_split = ebac_card_family_heatmap_labels,
    column_order = ebac_card_heatmap_column_order,
    column_gap = unit(6, "mm"),
    column_names_gp = grid::gpar(fontsize = 8),
    column_title_gp = gpar(fontsize = 14),
    column_title_rot = 0,
    # Rows (AMR gene families)
    cluster_rows = FALSE,
    show_row_dend = FALSE,
    row_split = ebac_card_family_heatmap_groups$DrugClass2,
    row_order = rownames(ebac_card_family_heatmap_signif),
    row_gap = unit(3.5, "mm"),
    row_names_side = "left",
    row_names_gp = gpar(fontsize = 6),
    row_title_rot = 0,
    # Significance strip on the right-hand side
    right_annotation = rowAnnotation(
      df = ebac_card_family_heatmap_signif,
      col = list(Significance = pcol),
      show_annotation_name = FALSE
    )
  )
gg_ebac_card_family_heatmap

# Write the heatmap to PDF. draw() is called explicitly: a bare object name is
# only rendered through top-level auto-printing, which does not happen when
# the code is run via source() or from inside a function or loop, leaving an
# empty PDF.
pdf(file = "./Results/Ebac_Expan_CARD.pdf", height = 6, width = 12)
ComplexHeatmap::draw(gg_ebac_card_family_heatmap)
dev.off()
## quartz_off_screen 
##                 2

CARD Resistance Genes: Any Infection

Supplemental Figure 1: Fecal Sample Collected from Transplant

# One row per "Liver Transplant" sample (first row of each sampleID) with its
# diversity_group and sample_days_from_transplant.
sample_days <- metaphlan_df_sumry %>%
  group_by(sampleID) %>%
  slice(1) %>%
  ungroup() %>%
  filter(db == "Liver Transplant") %>%
  select(sampleID, diversity_group) %>%
  left_join(select(sample_lookup, sampleID, sample_days_from_transplant))


# Five-number summary (min, quartiles, max) of sample_days_from_transplant,
# reshaped to long format with columns `placeholder`, `ranges` and `value`.
clin_quartiles <- sample_days %>%
  select(sample_days_from_transplant) %>%
  summarise(
    minimum = min(sample_days_from_transplant, na.rm = TRUE),
    lower = quantile(sample_days_from_transplant, probs = 0.25,
                     na.rm = TRUE, names = FALSE),
    median = quantile(sample_days_from_transplant, probs = 0.5,
                      na.rm = TRUE, names = FALSE),
    upper = quantile(sample_days_from_transplant, probs = 0.75,
                     na.rm = TRUE, names = FALSE),
    maximum = max(sample_days_from_transplant, na.rm = TRUE)
  ) %>%
  # Dummy id column so that every summary column can be pivoted.
  mutate(placeholder = 1) %>%
  pivot_longer(cols = !placeholder, names_to = "ranges", values_to = "value")



# Histogram of sampling day relative to transplant (1-day bins) with a solid
# line at day 0 and a dashed line at the median sampling day.
# eb() / et() are theme-element helpers defined elsewhere in this file
# (presumably shorthand for element_blank() / element_text() -- confirm).
sample_days_histogram <- sample_days %>%
  ggplot(aes(x = sample_days_from_transplant)) +
  geom_histogram(color = "black", fill = "grey", binwidth = 1) +
  geom_vline(aes(xintercept = 0),
             color = "orangered", linetype = "solid", size = 1.2) +
  geom_vline(aes(xintercept = median(sample_days_from_transplant, na.rm = TRUE)),
             color = "cyan2", linetype = "dashed", size = 1.2) +
  geom_text(
    aes(x = median(sample_days_from_transplant, na.rm = TRUE), y = 12.5),
    color = "cyan2",
    # pull() extracts the median as a plain vector; pasting the one-column
    # tibble returned by select() only worked through list coercion.
    label = paste0(
      "Median = ",
      clin_quartiles %>%
        filter(ranges == "median") %>%
        pull(value),
      " days"
    ),
    nudge_x = 4
  ) +
  geom_text(label = "Transplant Date = Day 0",
            x = -4, y = 12.5, color = "orangered") +
  theme_bw() +
  theme(
    panel.grid = eb(),
    axis.text = et(color = "black", size = 12),
    axis.title = et(color = "black", size = 14),
    legend.position = "none"
  ) +
  scale_x_continuous(breaks = seq(-7, 30, 5), limits = c(-7, 35)) +
  ylab("Number of Samples\n")


# Violin + box plot of sampling day by diversity group, sharing the x-axis
# scale and reference lines (day 0, median day) with the histogram, plus
# pairwise group comparisons from stat_compare_means().
sample_days_boxplot <- sample_days %>%
  ggplot(aes(
    x = sample_days_from_transplant,
    y = diversity_group,
    fill = diversity_group
  )) +
  geom_vline(aes(xintercept = 0),
             color = "orangered", linetype = "solid", size = 1.2) +
  geom_vline(aes(xintercept = median(sample_days_from_transplant, na.rm = TRUE)),
             color = "cyan2", linetype = "dashed", size = 1.2) +
  geom_violin() +
  geom_boxplot(alpha = 0.5, width = 0.15, outlier.shape = NA,
               fill = "white", color = "white") +
  stat_compare_means(
    comparisons = list(
      c("High Diversity", "Medium Diversity"),
      c("High Diversity", "Low Diversity"),
      c("Medium Diversity", "Low Diversity")
    ),
    bracket.size = 0.3,
    step.increase = 0.07
  ) +
  theme_bw() +
  theme(
    panel.grid = eb(),
    axis.text.x = et(color = "black", size = 12),
    axis.text.y = et(color = "black", size = 12),
    axis.ticks.y = eb(),
    axis.title = et(color = "black", size = 14),
    legend.position = "none"
  ) +
  scale_x_continuous(breaks = seq(-7, 30, 5), limits = c(-7, 35)) +
  scale_fill_manual(values = c("#3A001E", "#8A0246", "#C20463")) +
  ylab("Diversity Groups\n") +
  xlab("\nDays to Sample")

# Save the stacked histogram / violin figure as Supplemental Figure 1, then
# draw it once more on the current device.
pdf(
  file = "./Results/Supplemental_Figure_1.pdf",
  width = 12,
  height = 10,
  onefile = FALSE
)
gg.stack(sample_days_histogram, sample_days_boxplot)
dev.off()
## quartz_off_screen 
##                 2
gg.stack(sample_days_histogram, sample_days_boxplot)

Miscellaneous Statistics

# Export the taxon rows of "lt"-prefixed samples whose pctseqs exceeds 0.9
# (a single taxon making up more than 90% of the sample), together with the
# sample's diversity group.
metaphlan_df2 %>%
  left_join(select(metaphlan_df_sumry, sampleID, diversity_group)) %>%
  filter(grepl("^lt", sampleID)) %>%
  group_by(sampleID) %>%
  filter(pctseqs > 0.9) %>%
  select(-y.text, -db) %>%
  write.csv(file = "./Results/90percent_taxon.csv", row.names = FALSE)

Supplemental Figure 5B: Infections by Diversity Group

# Build the infection table behind Supplemental Figure 5B: rows of
# peri_criteria_all annotated with diversity_group (matched on the "lt-<n>"
# patientID extracted from sampleID), reduced to one organism category per
# row, with org_presence = 1 for bacterial infections and 0 otherwise.
diversity_infx <- peri_criteria_all %>% 
  left_join(metaphlan_df_sumry %>% 
              mutate(patientID = str_extract(string = sampleID, pattern = "lt-[0-9]+")) %>% 
              select(patientID, diversity_group)) %>% 
  # One row per patientID/eday: the table is sorted by descending infx_stool
  # and the first row of each group is kept.
  group_by(patientID, eday) %>%
  arrange(-infx_stool) %>%
  dplyr::slice(1) %>%
  ungroup() %>% 
  select(patientID, sampleID, diversity_group, bact_infection_present, infx_stool, organism1, micro1.factor) %>%
  distinct() %>% 
  # Normalise organism1 (strip whitespace, lower-case). Anything that does not
  # match one of the listed name fragments is relabelled "Culture Negative".
  # NOTE(review): "enterobacterales" is listed but not "enterobacter", so an
  # Enterobacter species would be relabelled "Culture Negative" here -- confirm.
  mutate(organism1 = gsub(x = organism1, pattern = "\\s+", replacement = ""),
         organism1 = str_to_lower(string = organism1),
         organism1 = ifelse(grepl(x = organism1, pattern = "enterococcus|enterobacterales|klebsiella|escherichia|citrobacter|proteus|staphyl|clostrid|pseudo|steno|bacteroides|helico"), organism1, "Culture Negative")) %>% 
  group_by(patientID, sampleID, infx_stool) %>%
  # One logical flag column per organism, matched against the whitespace-free
  # lower-case organism1 string.
  # NOTE(review): the patterns "enterobactercloaceae" and
  # "stenotrophmonasmaltophilia" differ from the usual spellings (cloacae,
  # Stenotrophomonas); if the source data use the usual spellings these flags
  # never fire. The column name `Staphylococcus epidermis` also differs from
  # its pattern ("...epidermidis"). Verify against the data.
  mutate(`Enterococcus faecium` = grepl(x = organism1, pattern = "enterococcusfaecium", ignore.case = T),
         `Enterococcus faecalis` = grepl(x = organism1, pattern = "enterococcusfaecalis", ignore.case = T),
         `Enterococcus avium` = grepl(x = organism1, pattern = "enterococcusavium", ignore.case = T),
         `Enterococcus gallinarum` = grepl(x = organism1, pattern = "enterococcusgallinarum", ignore.case = T),
         `Klebsiella pneumoniae` = grepl(x = organism1, pattern = "klebsiellapneumoniae", ignore.case = T),
         `Enterobacter cloaceae` = grepl(x = organism1, pattern = "enterobactercloaceae", ignore.case = T),
         `Escherichia coli` = grepl(x = organism1, pattern = "escherichiacoli", ignore.case = T),
         `Citrobacter freundii` = grepl(x = organism1, pattern = "citrobacterfreundii", ignore.case = T),
         `Proteus mirabilis` = grepl(x = organism1, pattern = "proteusmirabilis", ignore.case = T),
         `Staphylococcus aureus` = grepl(x = organism1, pattern = "staphylococcusaureus", ignore.case = T),
         `Staphylococcus epidermis` = grepl(x = organism1, pattern = "staphylococcusepidermidis", ignore.case = T),
         `Pseudomonas aeruginosa` = grepl(x = organism1, pattern = "pseudomonasaeruginosa", ignore.case = T),
         `Stenotrophmonas maltophilia` = grepl(x = organism1, pattern = "stenotrophmonasmaltophilia", ignore.case = T),
         `Helicobacter pylori` = grepl(x = organism1, pattern = "helicobacterpylori", ignore.case = T),
         `Clostridium difficile` = grepl(x = organism1, pattern = "clostridiumdifficile|clostridioidesdifficile", ignore.case = T),
         `Bacteroides sp.` = grepl(x = organism1, pattern = "bacteroides", ignore.case = T),
         `Culture Negative` = grepl(x = organism1, pattern = "Culture Negative", ignore.case = T)) %>%  
  # Long format: one row per (input row, organism flag).
  pivot_longer(-c(patientID:micro1.factor), names_to = "organisms", values_to = "org_presence") %>%
  mutate(organisms = ifelse(bact_infection_present == "No", "No Bacterial Infection", organisms),
         org_presence = ifelse(org_presence == TRUE, 1, 0)) %>% 
  # Within each sample/infection/organism keep the rows with the highest flag,
  # then drop every organism whose flag is 0.
  group_by(sampleID, infx_stool, bact_infection_present, organisms) %>% 
  dplyr::slice_max(org_presence) %>% 
  ungroup() %>% 
  filter(org_presence == 1) %>% 
  group_by(patientID, sampleID, organisms, org_presence) %>% 
  dplyr::slice(1) %>%
  # Rows without a sampleID fall back to the patientID as identifier.
  mutate(sampleID = ifelse(is.na(sampleID), patientID, sampleID)) %>% 
  mutate(org_presence = ifelse(grepl(pattern = "No", x = bact_infection_present, ignore.case = T), 0, org_presence)) %>% 
  ungroup() %>% 
  # NOTE(review): after the org_presence == 1 filter above, only rows whose
  # bact_infection_present matches "No" are reset to 0, so the
  # `== "Yes" & org_presence == 0` condition below looks unreachable -- confirm.
  mutate(organisms = ifelse(bact_infection_present == "Yes" & org_presence == 0, "Other", organisms)) %>% 
  # Collapse organisms outside the named genera into "Other Bacterial
  # Infection", restore the "No Bacterial Infection" label, order the
  # diversity groups, and recode org_presence to 0 for "No" rows and 1 for all
  # others.
  mutate(organisms = ifelse(grepl(x = organisms, pattern = "enterococcus|klebsiella|escherichia|proteus|citrobacter|culture", ignore.case = TRUE), organisms, "Other Bacterial Infection"),
         organisms = ifelse(bact_infection_present == "No", "No Bacterial Infection", organisms),
         diversity_group = as.character(diversity_group),
         diversity_group = factor(diversity_group, levels = c("Low Diversity", "Medium Diversity", "High Diversity")),
         org_presence = ifelse(grepl(pattern = "No", x = bact_infection_present, ignore.case = T), 0, 1))

# Pairwise proportion tests (BH-adjusted) comparing, between diversity groups,
# the share of rows with org_presence == 1. The p and p.adj columns are
# converted to display strings at the end.
diversity_infx_stats <-
  diversity_infx %>%
  ungroup() %>%
  group_by(diversity_group) %>%
  # Per-group counts. dplyr::n() is the size of the current group, whereas
  # `nrow(.)` inside a magrittr pipe is the row count of the whole piped data
  # frame -- that made org_presence_0 "all rows minus this group's positives"
  # and distorted every tested proportion.
  summarise(org_presence_1 = sum(org_presence == 1),
            org_presence_0 = dplyr::n() - org_presence_1) %>%
  # pairwise_prop_test() takes the counts as a table: groups as row names,
  # the two outcome counts as columns.
  column_to_rownames(var = "diversity_group") %>%
  rstatix::pairwise_prop_test(p.adjust.method = "BH") %>%
  mutate(p = ifelse(p < 0.001, "p < 0.001", paste("p = ", round(p, 2))),
         p.adj = ifelse(p.adj < 0.001, "p.adj < 0.001", paste("p.adj = ", round(p.adj, 2))))


# Supplemental Figure 5B: stacked bar chart of org_presence per diversity
# group, filled by infecting-organism category, with the pairwise
# proportion-test table (group1, group2, p.adj) inset at npc (0.975, 0.975).
#
# Colours and legend labels are given as vectors NAMED by org_colors level.
# Unnamed vectors are matched by position to whichever levels occur in the
# data, so a single absent category (e.g. no "Enterococcus avium" rows) would
# shift every later colour and legend label onto the wrong organism.
gg_diversity_infx <-
  diversity_infx %>%
  mutate(
    # Rows without a diversity group get their own "No Stool Sample" bar.
    diversity_group = as.character(diversity_group),
    diversity_group = ifelse(is.na(diversity_group), "No Stool Sample", diversity_group),
    diversity_group = factor(diversity_group, levels = c("Low Diversity", "Medium Diversity", "High Diversity", "No Stool Sample")),
    organisms = ifelse(grepl(organisms, pattern = "gallinarum"), "Other Bacterial Infection", organisms),
    # Colour key per row: 1-9 by organism, 10 for rows of the "No Stool
    # Sample" group that match none of the organisms, 0 for everything else
    # (removed below).
    org_colors = case_when(grepl(x = organisms, pattern = "enterococcus faecium", ignore.case = TRUE) ~ 1,
                           grepl(x = organisms, pattern = "enterococcus faecalis", ignore.case = TRUE) ~ 2,
                           grepl(x = organisms, pattern = "enterococcus avium", ignore.case = TRUE) ~ 3,
                           grepl(x = organisms, pattern = "klebsiella pneumoniae", ignore.case = TRUE) ~ 4,
                           grepl(x = organisms, pattern = "escherichia coli", ignore.case = TRUE) ~ 5,
                           grepl(x = organisms, pattern = "proteus mirabilis", ignore.case = TRUE) ~ 6,
                           grepl(x = organisms, pattern = "citrobacter freundii", ignore.case = TRUE) ~ 7,
                           grepl(x = organisms, pattern = "other bacterial infection|gallinarum", ignore.case = TRUE) ~ 8,
                           grepl(x = organisms, pattern = "culture negative", ignore.case = TRUE) ~ 9,
                           grepl(x = diversity_group, pattern = "no stool sample", ignore.case = TRUE) ~ 10,
                           TRUE ~ 0),
    organisms = factor(organisms, levels = c("Enterococcus faecium",
                                             "Enterococcus faecalis",
                                             "Enterococcus avium",
                                             "Klebsiella pneumoniae",
                                             "Escherichia coli",
                                             "Proteus mirabilis",
                                             "Citrobacter freundii",
                                             "Other Bacterial Infection",
                                             "Culture Negative",
                                             "No Bacterial Infection",
                                             "No Stool Sample")),
    org_colors = factor(org_colors, levels = c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "0"))
  ) %>%
  filter(org_colors != "0") %>%
  ggplot(aes(x = diversity_group, y = org_presence, fill = org_colors)) +
  geom_col(color = "black", size = 0.3) +
  theme_bw() +
  theme(
    axis.text = et(size = 12, color = "black"),
    axis.title.x = eb(),
    axis.title.y = et(size = 14, color = "black"),
    panel.grid = eb(),
    panel.border = eb(),
    axis.line = el(color = "black")
  ) +
  # Inset statistics table; the "p.adj = " prefix is stripped for display.
  geom_table_npc(data = diversity_infx_stats %>%
                   select(` ` = group1, `  ` = group2, p.adj) %>%
                   mutate(p.adj = gsub(pattern = "p.adj = ", replacement = "", p.adj)),
                 label = list(diversity_infx_stats %>%
                                select(` ` = group1, `  ` = group2, p.adj) %>%
                                mutate(p.adj = gsub(pattern = "p.adj = ", replacement = "", p.adj))),
                 npcx = 0.975, npcy = 0.975) +
  scale_fill_manual(
    values = c("1" = "#129246",
               "2" = "#0C7A3A",
               "3" = "#055C2B",
               "4" = "#FF0000",
               "5" = "#CC0404",
               "6" = "#8A0202",
               "7" = "#5C0202",
               "8" = "#E6C66E",
               "9" = "#BD992D",
               "10" = "gray85"),
    labels = c("1" = "Enterococcus faecium",
               "2" = "Enterococcus faecalis",
               "3" = "Enterococcus avium",
               "4" = "Klebsiella pneumoniae",
               "5" = "Escherichia coli",
               "6" = "Proteus mirabilis",
               "7" = "Citrobacter freundii",
               "8" = "Other Bacterial Infection",
               "9" = "Culture Negative",
               "10" = "No Stool Sample")
  ) +
  labs(fill = "Infecting Organism",
       y = "Number of Infections\n(Including Reinfections)") +
  scale_y_continuous(expand = expansion(add = c(0.25)))
gg_diversity_infx

# Write Supplemental Figure 5B to disk (width 8, height 6).
ggsave(
  filename = "./Results/Supplemental_Figure_5B.pdf",
  plot = gg_diversity_infx,
  width = 8,
  height = 6
)

Validation Cohort

Import data

# Validation-cohort inputs: each object is read from an .rds file under ./Data.

# First samples lookup (joined on sampleID further down)
vc_first_samps <- readRDS("./Data/validation_cohort_first_samps.rds")

# All sample lookup
vc_all_samps <- readRDS("./Data/validation_cohort_all_samps.rds")

# All samples
# (presumably the validation counterpart of peri_criteria_all -- confirm)
vc_peri_criteria_all <- readRDS("./Data/validation_cohort_peri_criteria_all_anon.rds")

# Antibiotics data
vc_abx <- readRDS("./Data/validation_cohort_abx.rds")

# Demographics data; two `race` values are shortened to
# "Black/African-American" and "Unknown"
vc_demo <- readRDS("./Data/validation_cohort_demo.rds") %>%
    mutate(race = recode(race, `Black or African-American` = "Black/African-American",
        `Unknown or Patient unable to respond` = "Unknown"))

# Metagenomic (MetaPhlAn) table; the columns used below are sampleID, taxid,
# pctseqs and Total
vc_metaphlan_df <- readRDS("./Data/validation_cohort_metaphlan.rds")

# Peri-transplant samples
# NOTE(review): the file name is spelled "metahplan" -- confirm it matches the
# file on disk before correcting it.
vc_metaphlan_peri <- readRDS("./Data/validation_cohort_metahplan_peri.rds")

# Qualitative metabolomics (columns used below: patientID, sampleID, compound,
# mvalue)
vc_metab_qual <- readRDS("./Data/validation_cohort_metab_qual.rds")

Diversity Groups

# Highest Shannon value observed within each diversity group of
# metaphlan_df_sumry ("Healthy Donor" excluded). These per-group maxima are
# the cutoffs used to bin validation-cohort samples further down.
vc_diversity_thresh <- metaphlan_df_sumry %>%
  filter(diversity_group != "Healthy Donor") %>%
  select(diversity_group, Shannon) %>%
  group_by(diversity_group) %>%
  dplyr::slice_max(order_by = Shannon, n = 1, with_ties = FALSE)

# Per-sample relative abundances for the validation cohort's first samples:
# restricted to samples in vc_first_samps, de-duplicated, filtered to
# pctseqs >= 1e-04 and renormalised so each sample's pctseqs sum to 1.
# The result stays grouped by sampleID.
vc_t_metaphlan <- vc_metaphlan_df %>%
    # Two first-sample IDs are recoded (`-02` -> `-01`) before the join.
    right_join(vc_first_samps %>%
        mutate(sampleID = recode(sampleID, `vc-011-02` = "vc-011-01",
            `vc-012-02` = "vc-012-01"))) %>%
    mutate(db = "Liver Transplant") %>%
    select(sampleID, taxid, db, pctseqs, Total) %>%
    # Drop exact repeats of the same sampleID/taxid/pctseqs combination.
    group_by(sampleID, taxid, pctseqs) %>%
    slice(1) %>%
    ungroup() %>%
    filter(pctseqs >= 1e-04) %>%
    # NOTE(review): this looks like a prevalence filter (taxon present in at
    # least 10% of samples), but it runs while grouped by sampleID. Inside a
    # group length(unique(sampleID)) is always 1 and totalSp is the number of
    # rows for that taxid within the sample, so spPres >= 1 and the filter
    # appears to keep every row. Confirm whether a cohort-wide computation was
    # intended (and whether the discovery-cohort code does the same).
    group_by(sampleID) %>%
    dplyr::add_count(taxid, name = "totalSp") %>%
    mutate(sampleID_count = length(unique(sampleID)), spPres = totalSp/sampleID_count) %>%
    filter(spPres >= 0.1) %>%
    select(-c(Total, sampleID_count, spPres, totalSp)) %>%
    # Renormalise within each sample.
    group_by(sampleID) %>%
    mutate(pctseqs = pctseqs/sum(pctseqs))


# Wide sample x taxon table of relative abundances (absent taxa filled with
# 0), with sampleID moved to the row names and the db column dropped.
vc_t_metaphlan_mat <- vc_t_metaphlan %>%
  distinct() %>%
  pivot_wider(
    names_from = taxid,
    values_from = pctseqs,
    values_fill = 0
  ) %>%
  column_to_rownames("sampleID") %>%
  select(-db)

# Shannon alpha diversity per sample, as a two-column data frame
# (sampleID, Shannon).
vc_alpha_shannon <- vc_t_metaphlan_mat %>%
  vegan::diversity(index = "shannon") %>%
  as.data.frame() %>%
  rownames_to_column("sampleID") %>%
  # as.data.frame() on the piped vector names its single column "."
  dplyr::rename(Shannon = ".")

#### Plot MetaPhlAn4 relative abundance: taxonomy table ####

# Adds taxonomy (tax_lookup) and Shannon diversity to the per-sample
# abundances, rewrites Genus as "Phylum-Order-Family-Genus", and computes
# y.text, the midpoint of each taxon's segment in the per-sample stacked bar.
vc_metaphlan_df2 <- vc_t_metaphlan %>%
  mutate(db = factor(db, levels = "Liver Transplant")) %>%
  left_join(tax_lookup) %>%
  drop_na(taxid) %>%
  arrange(Kingdom, Phylum, Class, Order, Family, Genus) %>%
  mutate(Genus = paste(Phylum, Order, Family, Genus, sep = "-")) %>%
  left_join(vc_alpha_shannon) %>%
  group_by(sampleID) %>%
  arrange(Genus) %>%
  mutate(
    cum.pct = cumsum(pctseqs),
    # Midpoint between the previous cumulative total (0 for the first taxon)
    # and the current one.
    y.text = (cum.pct + c(0, head(cum.pct, -1))) / 2
  ) %>%
  ungroup() %>%
  dplyr::select(-cum.pct)

# Fill palette for the taxonomy bars.
vc_metaphlan_pal <- getRdpPal(vc_metaphlan_df2)

# Stacked relative-abundance bars per validation-cohort sample, ordered by
# Shannon diversity and faceted by diversity group. Groups are assigned by
# comparing Shannon with the vc_diversity_thresh cutoffs for "Low Diversity"
# and "Medium Diversity".
gg_vc_metaphlan <- vc_metaphlan_df2 %>%
  right_join(
    vc_first_samps %>%
      mutate(sampleID = recode(sampleID,
                               `vc-011-02` = "vc-011-01",
                               `vc-012-02` = "vc-012-01"))
  ) %>%
  mutate(
    diversity_group = case_when(
      Shannon <= vc_diversity_thresh$Shannon[vc_diversity_thresh$diversity_group == "Low Diversity"] ~
        "Low Diversity",
      Shannon > vc_diversity_thresh$Shannon[vc_diversity_thresh$diversity_group == "Low Diversity"] &
        Shannon <= vc_diversity_thresh$Shannon[vc_diversity_thresh$diversity_group == "Medium Diversity"] ~
        "Medium Diversity",
      Shannon > vc_diversity_thresh$Shannon[vc_diversity_thresh$diversity_group == "Medium Diversity"] ~
        "High Diversity"
    )
  ) %>%
  drop_na(diversity_group) %>%
  mutate(
    diversity_group = factor(
      diversity_group,
      levels = c("Low Diversity", "Medium Diversity", "High Diversity")
    )
  ) %>%
  ungroup() %>%
  # Fix the stacking order to the order in which Genus values appear.
  mutate(Genus = factor(Genus, levels = unique(Genus))) %>%
  group_by(sampleID) %>%
  arrange(Genus) %>%
  ggplot(aes(x = reorder(sampleID, Shannon), y = pctseqs)) +
  geom_bar(stat = "identity", aes(fill = Genus), width = 0.9) +
  scale_fill_manual(values = vc_metaphlan_pal) +
  theme_bw() +
  theme(
    legend.position = "none",
    axis.text.x = eb(),
    axis.ticks.x = eb(),
    strip.text.x = et(angle = 0, size = 14),
    strip.background = eb(),
    axis.title.y = et(color = "black", size = 14),
    axis.text.y = et(color = "black", size = 12),
    panel.spacing = unit(0.5, "lines"),
    plot.margin = margin(t = 5, r = 5, b = 0, l = 5)
  ) +
  facet_grid(. ~ diversity_group, scales = "free", space = "free") +
  scale_y_continuous(
    expand = expansion(mult = c(0.005, 0.005)),
    labels = scales::percent_format(accuracy = 1)
  ) +
  ylab("MetaPhlAn4 Relative Abundance") +
  xlab("")


# Color facets: recolour the title text of each facet strip with the matching
# diversity-group colour by editing the plot's gtable directly.
gg_vc_metaphlan_grob <- ggplot_gtable(ggplot_build(gg_vc_metaphlan))
vc_strip_both <- grep("strip-", gg_vc_metaphlan_grob$layout$name)
vc_fills <- diversity_group_colors[c(1:3)]
vc_k <- 1
for (i in vc_strip_both) {
    vc_l <- grep("titleGrob", gg_vc_metaphlan_grob$grobs[[i]]$grobs[[1]]$childrenOrder)
    # Use the validation-cohort palette defined above. The loop previously
    # read `fills`, a variable not defined in this section, so it depended on
    # whatever an earlier part of the script had left in the workspace.
    gg_vc_metaphlan_grob$grobs[[i]]$grobs[[1]]$children[[vc_l]]$children[[1]]$gp$col <- vc_fills[vc_k]
    vc_k <- vc_k + 1
}

# Shannon diversity per validation-cohort sample as a bar strip, ordered by
# Shannon and faceted / coloured by diversity group (same cutoffs as the
# abundance plot).
gg_vc_shannon <- vc_alpha_shannon %>%
  mutate(
    diversity_group = case_when(
      Shannon <= vc_diversity_thresh$Shannon[vc_diversity_thresh$diversity_group == "Low Diversity"] ~
        "Low Diversity",
      Shannon > vc_diversity_thresh$Shannon[vc_diversity_thresh$diversity_group == "Low Diversity"] &
        Shannon <= vc_diversity_thresh$Shannon[vc_diversity_thresh$diversity_group == "Medium Diversity"] ~
        "Medium Diversity",
      Shannon > vc_diversity_thresh$Shannon[vc_diversity_thresh$diversity_group == "Medium Diversity"] ~
        "High Diversity"
    )
  ) %>%
  drop_na(diversity_group) %>%
  mutate(
    diversity_group = factor(
      diversity_group,
      levels = c("Low Diversity", "Medium Diversity", "High Diversity")
    )
  ) %>%
  ggplot(aes(x = reorder(sampleID, Shannon), y = Shannon)) +
  geom_bar(stat = "identity", aes(fill = diversity_group), width = 0.9) +
  theme_bw() +
  theme(
    legend.position = "none",
    axis.text.x = eb(),
    axis.title.x = eb(),
    axis.ticks.x = eb(),
    strip.text = eb(),
    strip.background = er(fill = "white"),
    axis.title.y = et(color = "black", size = 14),
    axis.text.y = et(color = "black", size = 12),
    panel.spacing = unit(0.5, "lines"),
    plot.margin = margin(t = 0, r = 5, b = 0, l = 5),
    panel.grid = eb()
  ) +
  scale_fill_manual(values = diversity_group_colors[c(1:3)]) +
  facet_grid(. ~ diversity_group, scales = "free", space = "free")

# Stack the abundance plot (with recoloured strips) above the Shannon strip
# and write the combined figure to PDF.
gg_vc_metaphlan_shannon <- plot_grid(gg_vc_metaphlan_grob, gg_vc_shannon,
    axis = "lb", align = "hv", nrow = 2, rel_heights = c(1, 0.15))

pdf(file = "./Results/VC_Metaphlan_Diversity.pdf", width = 12.25,
    height = 8)
# print() is explicit: a bare object name is only drawn by top-level
# auto-printing, which is skipped under source() or inside functions/loops and
# would leave the PDF empty.
print(gg_vc_metaphlan_shannon)
invisible(dev.off())

sPLS-DA Diversity Groups

# Re-train / tune the original model on ALL of the original data.
# Step 1: fit a 5-component model to decide how many components to keep.
tot_diversity_train_splsda <- mixOmics::splsda(
  diversity_metab_mat,
  diversity_metab_labs,
  ncomp = 5
)

# Step 2: performance assessment with 5-fold cross-validation, 50 repeats.
set.seed(1234)
tot_diversity_train_plsda_perf <- perf(
  tot_diversity_train_splsda,
  validation = "Mfold",
  folds = 5,
  nrepeat = 50,
  auc = TRUE,
  progressBar = FALSE
)

# ncomp = 4 might be best for classification error rate and max.dist
plot(
  tot_diversity_train_plsda_perf,
  col = color.mixo(5:7),
  sd = FALSE,
  auc = TRUE,
  legend.position = "horizontal"
)

# Candidate numbers of variables to keep per component: 1-10, then 20-130 in
# steps of 10.
tot_diversity_train_keepX <- c(1:10, seq(20, 130, 10))

# Tune keepX by 5-fold cross-validation (50 repeats) on the balanced error
# rate with max.dist; up to 4 components are explored to be safe.
set.seed(123)
tot_diversity_train_tune_splsda <- mixOmics::tune.splsda(
  diversity_metab_mat,
  diversity_metab_labs,
  ncomp = 4,
  test.keepX = tot_diversity_train_keepX,
  validation = "Mfold",
  folds = 5,
  nrepeat = 50,
  dist = "max.dist",
  measure = "BER",
  auc = TRUE,
  progressBar = FALSE
)

plot(tot_diversity_train_tune_splsda, col = color.jet(4))

tot_diversity_error <- tot_diversity_train_tune_splsda$error.rate
# Optimal number of components, chosen by t-tests on the error rate
tot_diversity_ncomp <- tot_diversity_train_tune_splsda$choice.ncomp$ncomp
tot_diversity_ncomp #1 component is optimal
## [1] 1
# Optimal number of variables per component. At least two components are
# always kept, even when tuning selects a single one.
tot_diversity_select_keepX <-
  tot_diversity_train_tune_splsda$choice.keepX[1:max(tot_diversity_ncomp, 2)]
tot_diversity_select_keepX
## comp1 comp2 
##    70     1
# Final Model
tot_diversity_splsda_final <- mixOmics::splsda(
  diversity_metab_mat,
  diversity_metab_labs,
  ncomp = max(tot_diversity_ncomp, 2),
  keepX = tot_diversity_select_keepX
)

# Needed for downstream excel file for component loadings
tot_diversity_splsda_final_copy <- tot_diversity_splsda_final

# Build data matrix: validation-cohort samples in rows (sampleID as row
# names) and compounds in columns, restricted to the first samples and to the
# columns of the training matrix diversity_metab_mat.
vc_diversity_metab_mat <-
  vc_metab_qual %>%
  ungroup() %>% 
  select(patientID, sampleID, compound, mvalue) %>% 
  # Keep first samples only (two sample IDs recoded `-02` -> `-01`).
  inner_join(vc_first_samps %>% mutate(sampleID = recode(sampleID, `vc-011-02` = "vc-011-01", `vc-012-02` = "vc-012-01"))) %>% 
  # Title-case compound names before matching them against heatmap_cmpds.
  mutate(compound = str_to_title(compound)) %>%
  filter(compound %in% heatmap_cmpds$compound|is.na(compound)) %>% 
  ungroup() %>%
  # Attach each sample's abbreviated diversity group (Low / Medium / High),
  # assigned from Shannon with the vc_diversity_thresh cutoffs.
  left_join(
    vc_metaphlan_df2 %>%
      right_join(vc_first_samps %>% mutate(
        sampleID = recode(sampleID, `vc-011-02` = "vc-011-01", `vc-012-02` = "vc-012-01")
      )) %>%
      mutate(
        diversity_group = case_when(
          Shannon <= vc_diversity_thresh$Shannon[vc_diversity_thresh$diversity_group == "Low Diversity"] ~
            "Low Diversity",
          Shannon > vc_diversity_thresh$Shannon[vc_diversity_thresh$diversity_group == "Low Diversity"] &
            Shannon <= vc_diversity_thresh$Shannon[vc_diversity_thresh$diversity_group == "Medium Diversity"] ~
            "Medium Diversity",
          Shannon > vc_diversity_thresh$Shannon[vc_diversity_thresh$diversity_group == "Medium Diversity"] ~
            "High Diversity"
        ),
        diversity_group_abv = recode(diversity_group, `Low Diversity` = "Low", `Medium Diversity` = "Medium", `High Diversity` = "High")
      ) %>%
      drop_na(diversity_group, diversity_group_abv) %>%
      mutate(
        # diversity_group_abv = factor(diversity_group_abv, levels = c("Low", "Medium", "High")),
        diversity_group = factor(
        diversity_group,
        levels = c("Low Diversity", "Medium Diversity", "High Diversity")
      )) %>% 
      ungroup() %>% 
      distinct(sampleID, diversity_group_abv)
  ) %>% 
  # With the summarise() step commented out, this group_by()/ungroup() pair
  # has no effect.
  # NOTE(review): without that averaging step, repeated sampleID/compound
  # rows would reach pivot_wider() un-aggregated -- confirm they are unique.
  group_by(sampleID, compound, diversity_group_abv) %>% 
  # summarise(mvalue = mean(mvalue, na.rm = TRUE)) %>% 
  ungroup() %>% 
  # Replace NaN with NA in every column.
  mutate_all(~replace(., is.nan(.), NA)) %>%
  select(sampleID, compound, mvalue, diversity_group_abv) %>% 
  drop_na(sampleID) %>% 
  pivot_wider(names_from = "compound", values_from = "mvalue") %>% 
  # mutate_all(~replace(., is.na(.), 0)) %>% 
  filter(sampleID != "") %>%
  column_to_rownames(var = "sampleID") %>% 
  select(-diversity_group_abv) %>%
  # Same columns, in the same order, as the training matrix.
  select(names(diversity_metab_mat)) %>% 
  # Drop samples with no measured value at all.
  filter_all(any_vars(!is.na(.)))

# Build labels: the abbreviated diversity group (factor with levels Low,
# Medium, High; unused levels dropped) for each row of
# vc_diversity_metab_mat, in the matrix's row order.
vc_diversity_metab_labs <-
  vc_diversity_metab_mat %>%
  rownames_to_column("sampleID") %>%
  left_join(
    vc_metaphlan_df2 %>%
      right_join(
        vc_first_samps %>%
          mutate(sampleID = recode(sampleID,
                                   `vc-011-02` = "vc-011-01",
                                   `vc-012-02` = "vc-012-01"))
      ) %>%
      mutate(
        # Bin Shannon with the vc_diversity_thresh cutoffs.
        diversity_group = case_when(
          Shannon <= vc_diversity_thresh$Shannon[vc_diversity_thresh$diversity_group == "Low Diversity"] ~
            "Low Diversity",
          Shannon > vc_diversity_thresh$Shannon[vc_diversity_thresh$diversity_group == "Low Diversity"] &
            Shannon <= vc_diversity_thresh$Shannon[vc_diversity_thresh$diversity_group == "Medium Diversity"] ~
            "Medium Diversity",
          Shannon > vc_diversity_thresh$Shannon[vc_diversity_thresh$diversity_group == "Medium Diversity"] ~
            "High Diversity"
        ),
        diversity_group_abv = recode(
          diversity_group,
          `Low Diversity` = "Low",
          `Medium Diversity` = "Medium",
          `High Diversity` = "High"
        )
      ) %>%
      drop_na(diversity_group, diversity_group_abv) %>%
      mutate(
        diversity_group_abv = factor(
          diversity_group_abv,
          levels = c("Low", "Medium", "High")
        )
      ) %>%
      ungroup() %>%
      distinct(sampleID, diversity_group_abv)
  ) %>%
  droplevels() %>%
  pull(diversity_group_abv)

# Sanity check: rows of the matrix and number of labels should agree.
dim(vc_diversity_metab_mat) # 35 93 (35 LT patients x 93 compounds)
## [1] 35 93
length(vc_diversity_metab_labs) # 35 (one label per LT patient)
## [1] 35
# Predict validation-cohort classes with the final model (all distances).
vc_diversity_tot <- predict(
  tot_diversity_splsda_final,
  vc_diversity_metab_mat,
  dist = "all"
)

# Predicted class by max.dist, read from the column for the tuned number of
# components.
vc_diversity_tot_predict <- factor(
  vc_diversity_tot$class$max.dist[, tot_diversity_ncomp],
  levels = c("Low", "Medium", "High")
)

# Classes occurring in either the predictions or the true labels.
vc_diversity_tot_union <- factor(
  union(vc_diversity_tot_predict, vc_diversity_metab_labs),
  levels = c("Low", "Medium", "High")
)

# Confusion matrix: predictions in rows, true labels in columns. The two
# vectors are passed to table() as bare names so that the table keeps them as
# dimension names.
vc_diversity_cm <- confusionMatrix(
  table(vc_diversity_tot_predict, vc_diversity_metab_labs)
)

vc_diversity_cm
## Confusion Matrix and Statistics
## 
##                         vc_diversity_metab_labs
## vc_diversity_tot_predict Low Medium High
##                   Low     16     10    2
##                   Medium   0      2    1
##                   High     0      1    3
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6             
##                  95% CI : (0.4211, 0.7613)
##     No Information Rate : 0.4571          
##     P-Value [Acc > NIR] : 0.063603        
##                                           
##                   Kappa : 0.3137          
##                                           
##  Mcnemar's Test P-Value : 0.007383        
## 
## Statistics by Class:
## 
##                      Class: Low Class: Medium Class: High
## Sensitivity              1.0000       0.15385     0.50000
## Specificity              0.3684       0.95455     0.96552
## Pos Pred Value           0.5714       0.66667     0.75000
## Neg Pred Value           1.0000       0.65625     0.90323
## Prevalence               0.4571       0.37143     0.17143
## Detection Rate           0.4571       0.05714     0.08571
## Detection Prevalence     0.8000       0.08571     0.11429
## Balanced Accuracy        0.6842       0.55420     0.73276
# Additional per-class model measures (mltest), with predicted and true labels
# coerced to the same Low / Medium / High level order.
vc_diversity_epi <- mltest::ml_test(
  predicted = factor(vc_diversity_tot_predict, levels = c("Low", "Medium", "High")),
  true = factor(vc_diversity_metab_labs, levels = c("Low", "Medium", "High"))
)

# Copy of the caret confusion table with display-friendly dimension labels:
# rows are predictions, columns are observed classes.
vc_diversity_cm_names <- vc_diversity_cm$table

rownames(vc_diversity_cm_names) <- paste0("Predicted\n", c("Low", "Medium", "High"))
colnames(vc_diversity_cm_names) <- paste0("Actual\n", c("Low", "Medium", "High"))

# Transposed so observed classes run down the rows.
vc_diversity_confusion_df <- t(vc_diversity_cm_names)

vc_diversity_confusion_df
##                        vc_diversity_tot_predict
## vc_diversity_metab_labs Predicted\nLow Predicted\nMedium Predicted\nHigh
##          Actual\nLow                16                 0               0
##          Actual\nMedium             10                 2               1
##          Actual\nHigh                2                 1               3
# multiclass 95% CI
# Multi-class confusion-matrix summary for the validation cohort: observed
# labels first, predicted labels second, 1000 bootstrap resamples, Wilson
# method, 95% confidence level, values rounded to 2 digits.
vc_diversity_mc <-
  biostatUtil::multiClassCM(
    factor(vc_diversity_metab_labs, levels = c("Low", "Medium", "High")),
    factor(vc_diversity_tot_predict, levels = c("Low", "Medium", "High")),
    seed = 20,
    num.boot = 1000,
    conf.level = 0.95,
    digits = 2,
    method = "wilson"
  )

# Split each per-class cell into numeric estimate / lower / upper columns.
# Each cell is split on single spaces into four pieces; the third piece is
# discarded (`NA` in `into`), and the parentheses are stripped before the
# numeric conversion.
# NOTE(review): assumes cells look like "est (lower - upper)" — confirm against
# the installed biostatUtil version.
# `across()` replaces the deprecated `funs()` / `mutate_all()` / `mutate_if()`
# combination; every column is still converted to numeric, so entries that are
# not plain numbers after stripping become NA exactly as before.
vc_diversity_mc_table <- vc_diversity_mc$table %>%
  as.data.frame() %>%
  separate(Low, into = c("Low_Avg", "Low_Lower", NA, "Low_Upper"), sep = " ") %>%
  separate(Medium, into = c("Med_Avg", "Med_Lower", NA, "Med_Upper"), sep = " ") %>%
  separate(High, into = c("High_Avg", "High_Lower", NA, "High_Upper"), sep = " ") %>%
  select(
    Average,
    Low_Avg, Low_Lower, Low_Upper,
    Med_Avg, Med_Lower, Med_Upper,
    High_Avg, High_Lower, High_Upper
  ) %>%
  mutate(across(everything(), ~ as.numeric(str_replace(.x, "\\(|\\)", ""))))

sPLS-DA Enterococcus Expansion

# Train the model to tune hyperparameters
# Exploratory 5-component fit used to decide how many components to keep
tot_ecoc_doms_train_splsda <- mixOmics::splsda(
  ecoc_doms_metab_mat,
  ecoc_doms_metab_labs,
  ncomp = 5
)

# Performance assessment: 5-fold cross-validation repeated 50 times
set.seed(1234)
tot_ecoc_doms_train_plsda_perf <- perf(
  tot_ecoc_doms_train_splsda,
  validation = "Mfold",
  folds = 5,
  nrepeat = 50,
  auc = TRUE,
  progressBar = FALSE
)

plot(
  tot_ecoc_doms_train_plsda_perf,
  col = color.mixo(5:7),
  sd = FALSE,
  auc = TRUE,
  legend.position = "horizontal"
) # ncomp = 1 or 4 is best for classification error rate and max.dist

# Candidate numbers of variables to keep per component: 1-10, then 20-100 by 10
tot_ecoc_doms_train_keepX <- c(seq_len(10), seq(from = 20, to = 108, by = 10))

# Tune keepX (and the component count) on balanced error rate with max.dist
set.seed(123)
tot_ecoc_doms_train_tune_splsda <- mixOmics::tune.splsda(
  ecoc_doms_metab_mat,
  ecoc_doms_metab_labs,
  ncomp = 4, # Choose 4 components (max) to be safe
  test.keepX = tot_ecoc_doms_train_keepX,
  validation = "Mfold",
  folds = 5,
  nrepeat = 50,
  dist = "max.dist",
  measure = "BER",
  auc = TRUE,
  progressBar = FALSE
)

plot(tot_ecoc_doms_train_tune_splsda, col = color.jet(4))

tot_ecoc_doms_train_error <- tot_ecoc_doms_train_tune_splsda$error.rate
# Optimal number of components based on t-tests on the error rate
tot_ecoc_doms_train_ncomp <- tot_ecoc_doms_train_tune_splsda$choice.ncomp$ncomp
# Manual override (disabled): tot_ecoc_doms_train_ncomp = 4
# (4 components looked optimal on visual inspection)

# Optimal number of variables per component; when a single component is
# chosen, the first two are kept so the final model can be fit with 2.
tot_ecoc_doms_train_select_keepX <-
  tot_ecoc_doms_train_tune_splsda$choice.keepX[
    seq_len(if (tot_ecoc_doms_train_ncomp == 1) 2 else tot_ecoc_doms_train_ncomp)
  ]
tot_ecoc_doms_train_select_keepX
## comp1 comp2 
##    90     3
# Final Model: refit with the tuned keepX, using at least 2 components
# (a tuned value of 1 is bumped to 2).
tot_ecoc_doms_train_splsda_final <- mixOmics::splsda(
  ecoc_doms_metab_mat,
  ecoc_doms_metab_labs,
  ncomp = if (tot_ecoc_doms_train_ncomp == 1) 2 else tot_ecoc_doms_train_ncomp,
  keepX = tot_ecoc_doms_train_select_keepX
)

# Untouched copy kept for the downstream Excel export of component loadings
tot_ecoc_doms_train_splsda_final_copy <- tot_ecoc_doms_train_splsda_final

# Per-sample Enterococcus label for the validation cohort.
# Taxonomy rows are restricted to each patient's first sample (two sample IDs
# are remapped from -02 to -01), Enterococcus genera are summed by `pctseqs`,
# samples with no Enterococcus rows are added back with abundance 0, and each
# sample is labelled against the second entry of
# `optimal_cutpoint_rel$optimal_cutpoint`.
vc_ecoc <- vc_metaphlan_df2 %>%
  right_join(
    vc_first_samps %>%
      mutate(sampleID = recode(sampleID, `vc-011-02` = "vc-011-01", `vc-012-02` = "vc-012-01"))
  ) %>%
  group_by(sampleID) %>%
  filter(grepl("Enterococcus", Genus, ignore.case = TRUE)) %>%
  count(sampleID, wt = pctseqs, name = "enterococcus_rel_abundance") %>%
  # Re-introduce every first sample so samples without Enterococcus are kept
  full_join(
    vc_first_samps %>%
      ungroup() %>%
      mutate(sampleID = recode(sampleID, `vc-011-02` = "vc-011-01", `vc-012-02` = "vc-012-01")) %>%
      select(sampleID)
  ) %>%
  mutate(
    enterococcus_rel_abundance = ifelse(
      is.na(enterococcus_rel_abundance),
      0,
      enterococcus_rel_abundance
    ),
    domination = case_when(
      enterococcus_rel_abundance >= optimal_cutpoint_rel$optimal_cutpoint[2] ~ "Expansion",
      TRUE ~ "No Expansion"
    )
  ) %>%
  drop_na(domination) %>%
  ungroup() %>%
  distinct(sampleID, domination)
  
  
# Validation-cohort metabolite matrix for the Enterococcus expansion model:
# one row per sample (row names = sampleID), one column per compound, with
# columns restricted to and ordered like the training matrix
# `ecoc_doms_metab_mat`.
vc_ecoc_doms_metab_mat <-
 vc_metab_qual %>% 
  ungroup() %>% 
  select(patientID, sampleID, compound, mvalue) %>% 
  # Keep only each patient's first sample (two IDs remapped from -02 to -01)
  inner_join(vc_first_samps %>% mutate(sampleID = recode(sampleID, `vc-011-02` = "vc-011-01", `vc-012-02` = "vc-012-01"))) %>% 
  mutate(compound = str_to_title(compound)) %>%
  filter(compound %in% heatmap_cmpds$compound|is.na(compound)) %>% 
  ungroup() %>%
  left_join(vc_ecoc) %>% 
  # NOTE(review): with the summarise() below disabled, this group_by()/ungroup()
  # pair has no effect on the data.
  group_by(sampleID, compound, domination) %>% 
  # summarise(mvalue = mean(mvalue, na.rm = TRUE)) %>% 
  ungroup() %>% 
  # Convert NaN entries to NA
  mutate_all(~replace(., is.nan(.), NA)) %>%
  select(sampleID, compound, mvalue, domination) %>% 
  drop_na(sampleID) %>% 
  # Long -> wide: one column per compound
  pivot_wider(names_from = "compound", values_from = "mvalue") %>% 
  # mutate_all(~replace(., is.na(.), 0)) %>% 
  filter(sampleID != "") %>%
  # Drop rows in which every column is NA
  filter_all(any_vars(!is.na(.))) %>% 
  select(-domination) %>% 
  column_to_rownames(var = "sampleID") %>% 
  # Select the training-matrix columns by name; the training name
  # "Omega-Muricholic Acid" is looked up under its validation-cohort name.
  # NOTE(review): the selected column keeps the validation name, so it differs
  # from the training column name — confirm predict() handles this as intended.
  select(names(ecoc_doms_metab_mat) %>% as.data.frame() %>% dplyr::rename(compound = ".") %>% mutate(compound = recode(compound, `Omega-Muricholic Acid` = "3-Epicholic Acid Or Omega-Muricholic Acid")) %>% pull(compound)) %>% 
  # Drop samples with no measured value for any retained compound
  filter_all(any_vars(!is.na(.)))

  
# Observed Enterococcus labels, looked up from `vc_ecoc` in the same row order
# as the validation matrix.
vc_ecoc_doms_metab_labs <- left_join(
  rownames_to_column(vc_ecoc_doms_metab_mat, var = "sampleID"),
  vc_ecoc
)[["domination"]]

# Sanity checks: expect 35 samples x 93 compounds, and 35 labels
dim(vc_ecoc_doms_metab_mat) #35 93 (means 93 compounds and 35 LT patients)
## [1] 35 93
length(vc_ecoc_doms_metab_labs) #35 (means 35 LT patients)
## [1] 35
# Model metrics for all samples
# Score the validation-cohort matrix with the final Enterococcus sPLS-DA model.
vc_ecoc_doms_tot <- predict(tot_ecoc_doms_train_splsda_final,
                        vc_ecoc_doms_metab_mat,
                        dist = "all")

# Mahalanobis-distance class calls at column `tot_ecoc_doms_train_ncomp`.
# NOTE(review): tuning used dist = "max.dist" and the final model is fit with
# at least 2 components, while this uses mahalanobis.dist at the tuned
# component count — confirm both choices are intended.
vc_ecoc_doms_tot_predict <- vc_ecoc_doms_tot$class$mahalanobis.dist[, tot_ecoc_doms_train_ncomp]

# Classes present in either the predictions or the observed labels
vc_ecoc_doms_tot_union <- union(vc_ecoc_doms_tot_predict, vc_ecoc_doms_metab_labs)

# Confusion matrix (first dimension = predicted, second = observed).
# Both vectors are coded with the same explicit levels. The union vector is
# deliberately NOT passed to factor(): as a second positional argument next to
# a named `levels =` it is matched to `labels`, which silently swaps the class
# names whenever the first-appearance order of the union differs from `levels`
# (and errors if the union holds more than two values).
vc_ecoc_doms_cm <- confusionMatrix(
  table(
    factor(vc_ecoc_doms_tot_predict, levels = c("Expansion", "No Expansion")),
    factor(vc_ecoc_doms_metab_labs, levels = c("Expansion", "No Expansion"))
  ),
  positive = "Expansion"
)
vc_ecoc_doms_cm
## Confusion Matrix and Statistics
## 
##               
##                Expansion No Expansion
##   Expansion           10            5
##   No Expansion         2           18
##                                           
##                Accuracy : 0.8             
##                  95% CI : (0.6306, 0.9156)
##     No Information Rate : 0.6571          
##     P-Value [Acc > NIR] : 0.05024         
##                                           
##                   Kappa : 0.5812          
##                                           
##  Mcnemar's Test P-Value : 0.44969         
##                                           
##             Sensitivity : 0.8333          
##             Specificity : 0.7826          
##          Pos Pred Value : 0.6667          
##          Neg Pred Value : 0.9000          
##              Prevalence : 0.3429          
##          Detection Rate : 0.2857          
##    Detection Prevalence : 0.4286          
##       Balanced Accuracy : 0.8080          
##                                           
##        'Positive' Class : Expansion       
## 
# Additional model measures
# Diagnostic-test statistics (95% confidence level) from the raw
# predicted-by-observed cross-tabulation. Both inputs are cross-tabulated
# as-is, so the class order is whatever table() assigns.
vc_ecoc_doms_epi <- epiR::epi.tests(
  table(vc_ecoc_doms_tot_predict, vc_ecoc_doms_metab_labs),
  conf.level = 0.95
)

# Display version of the 2x2 table: transposed so observed classes are rows,
# with "+" mapped to Expansion and "-" to No Expansion.
vc_ecoc_doms_confusion_df <- as.data.frame(t(vc_ecoc_doms_epi$tab)) %>%
  rownames_to_column(var = "actual") %>%
  dplyr::rename(
    "Predicted\nExpansion" = "Test +",
    "Predicted\nNo Expansion" = "Test -"
  ) %>%
  mutate(
    actual = case_when(
      grepl("+", actual, fixed = TRUE) ~ "Actual\nExpansion",
      grepl("-", actual, fixed = TRUE) ~ "Actual\nNo Expansion",
      TRUE ~ "Total"
    )
  ) %>%
  column_to_rownames(var = "actual")

vc_ecoc_doms_confusion_df
# odds ratio
vc_ecoc_doms_epi$detail[2][6,]
## [1] 18

sPLS-DA Enterobacterales Expansion

# Train the model to tune hyperparameters
# Exploratory 5-component fit used to decide how many components to keep
tot_ebac_doms_train_splsda <- mixOmics::splsda(
  ebac_doms_metab_mat,
  ebac_doms_metab_labs,
  ncomp = 5
)

# Performance assessment: 5-fold cross-validation repeated 50 times
set.seed(3456)
tot_ebac_doms_train_plsda_perf <- perf(
  tot_ebac_doms_train_splsda,
  validation = "Mfold",
  folds = 5,
  nrepeat = 50,
  auc = TRUE,
  progressBar = FALSE
)

plot(
  tot_ebac_doms_train_plsda_perf,
  col = color.mixo(5:7),
  sd = FALSE,
  auc = TRUE,
  legend.position = "horizontal"
) # ncomp = 2 seems best for classification error rate and max.dist

# Candidate numbers of variables to keep per component: 1-10, then 20-100 by 10
tot_ebac_doms_train_keepX <- c(seq_len(10), seq(from = 20, to = 108, by = 10))

# Tune keepX (and the component count) on balanced error rate with max.dist
set.seed(123)
tot_ebac_doms_train_tune_splsda <- mixOmics::tune.splsda(
  ebac_doms_metab_mat,
  ebac_doms_metab_labs,
  ncomp = 3, # Choose 3 components (max) to be safe
  test.keepX = tot_ebac_doms_train_keepX,
  validation = "Mfold",
  folds = 5,
  nrepeat = 50,
  dist = "max.dist",
  measure = "BER",
  auc = TRUE,
  progressBar = FALSE
)

plot(tot_ebac_doms_train_tune_splsda, col = color.jet(3))

tot_ebac_doms_train_error <- tot_ebac_doms_train_tune_splsda$error.rate
# Optimal number of components based on t-tests on the error rate
tot_ebac_doms_train_ncomp <- tot_ebac_doms_train_tune_splsda$choice.ncomp$ncomp
# Manual override (disabled): tot_ebac_doms_train_ncomp = 2
# NOTE(review): the original remark said "4 components are optimal via visual
# inspection", which conflicts with the value 2 — confirm which was meant.

# Optimal number of variables per component; when a single component is
# chosen, the first two are kept so the final model can be fit with 2.
tot_ebac_doms_train_select_keepX <-
  tot_ebac_doms_train_tune_splsda$choice.keepX[
    seq_len(if (tot_ebac_doms_train_ncomp == 1) 2 else tot_ebac_doms_train_ncomp)
  ]
tot_ebac_doms_train_select_keepX
## comp1 comp2 
##     1     1
# Final Model: refit with the tuned keepX, using at least 2 components
# (a tuned value of 1 is bumped to 2).
tot_ebac_doms_train_splsda_final <- mixOmics::splsda(
  ebac_doms_metab_mat,
  ebac_doms_metab_labs,
  ncomp = if (tot_ebac_doms_train_ncomp == 1) 2 else tot_ebac_doms_train_ncomp,
  keepX = tot_ebac_doms_train_select_keepX
)

# Untouched copy kept for the downstream Excel export of component loadings
tot_ebac_doms_train_splsda_final_copy <- tot_ebac_doms_train_splsda_final

# Per-sample Enterobacterales label for the validation cohort.
# Taxonomy rows are restricted to each patient's first sample (two sample IDs
# are remapped from -02 to -01), matching rows are summed by `pctseqs`,
# samples with no matching rows are added back with abundance 0, and each
# sample is labelled against the first entry of
# `optimal_cutpoint_rel$optimal_cutpoint`.
# NOTE(review): "Enterobacterales" is an order-level name but is matched
# against the `Genus` column — confirm this is the intended taxonomy column.
vc_ebac <- vc_metaphlan_df2 %>%
  right_join(
    vc_first_samps %>%
      mutate(sampleID = recode(sampleID, `vc-011-02` = "vc-011-01", `vc-012-02` = "vc-012-01"))
  ) %>%
  group_by(sampleID) %>%
  filter(grepl("Enterobacterales", Genus, ignore.case = TRUE)) %>%
  count(sampleID, wt = pctseqs, name = "enterobacterales_rel_abundance") %>%
  # Re-introduce every first sample so samples without a match are kept
  full_join(
    vc_first_samps %>%
      ungroup() %>%
      mutate(sampleID = recode(sampleID, `vc-011-02` = "vc-011-01", `vc-012-02` = "vc-012-01")) %>%
      select(sampleID)
  ) %>%
  mutate(
    enterobacterales_rel_abundance = ifelse(
      is.na(enterobacterales_rel_abundance),
      0,
      enterobacterales_rel_abundance
    ),
    domination = case_when(
      enterobacterales_rel_abundance >= optimal_cutpoint_rel$optimal_cutpoint[1] ~ "Expansion",
      TRUE ~ "No Expansion"
    )
  ) %>%
  drop_na(domination) %>%
  ungroup() %>%
  distinct(sampleID, domination)
  
  
# Validation-cohort metabolite matrix for the Enterobacterales expansion
# model: one row per sample (row names = sampleID), one column per compound,
# with columns restricted to and ordered like the training matrix
# `ebac_doms_metab_mat`.
vc_ebac_doms_metab_mat <-
  vc_metab_qual %>% 
  ungroup() %>% 
  select(patientID, sampleID, compound, mvalue#, 
        
         ) %>% 
  # Keep only each patient's first sample (two IDs remapped from -02 to -01)
  inner_join(vc_first_samps %>% mutate(sampleID = recode(sampleID, `vc-011-02` = "vc-011-01", `vc-012-02` = "vc-012-01"))) %>% 
  mutate(compound = str_to_title(compound)) %>%
  filter(compound %in% heatmap_cmpds$compound|is.na(compound)) %>% 
  ungroup() %>%
  left_join(vc_ebac) %>% 
  # NOTE(review): with the summarise() below disabled, this group_by()/ungroup()
  # pair has no effect on the data.
  group_by(sampleID, compound, domination) %>% 
  # summarise(mvalue = mean(mvalue, na.rm = TRUE)) %>% 
  ungroup() %>% 
  # Convert NaN entries to NA
  mutate_all(~replace(., is.nan(.), NA)) %>%
  select(sampleID, compound, mvalue, domination) %>% 
  drop_na(sampleID) %>% 
  # Long -> wide: one column per compound
  pivot_wider(names_from = "compound", values_from = "mvalue") %>% 
  # Replace every remaining NA with 0.
  # NOTE(review): this zero-fill is active here but commented out in the
  # otherwise parallel Enterococcus pipeline — confirm the difference is
  # intended. After it, the NA-based row filters below cannot drop any row.
  mutate_all(~replace(., is.na(.), 0)) %>%
  filter(sampleID != "") %>%
  filter_all(any_vars(!is.na(.))) %>% 
  select(-domination) %>% 
  column_to_rownames(var = "sampleID") %>% 
  # Select the training-matrix columns by name; the training name
  # "Omega-Muricholic Acid" is looked up under its validation-cohort name.
  # NOTE(review): the selected column keeps the validation name, so it differs
  # from the training column name — confirm predict() handles this as intended.
  select(names(ebac_doms_metab_mat) %>% as.data.frame() %>% dplyr::rename(compound = ".") %>% mutate(compound = recode(compound, `Omega-Muricholic Acid` = "3-Epicholic Acid Or Omega-Muricholic Acid")) %>% pull(compound)) %>% 
  filter_all(any_vars(!is.na(.)))

  
# Observed Enterobacterales labels, looked up from `vc_ebac` in the same row
# order as the validation matrix.
vc_ebac_doms_metab_labs <- left_join(
  rownames_to_column(vc_ebac_doms_metab_mat, var = "sampleID"),
  vc_ebac
)[["domination"]]

# Sanity checks: expect 35 samples x 93 compounds, and 35 labels
dim(vc_ebac_doms_metab_mat) #35 93 (means 93 compounds and 35 LT patients)
## [1] 35 93
length(vc_ebac_doms_metab_labs) #35 (means 35 LT patients)
## [1] 35
# Model metrics for all samples
# Score the validation-cohort matrix with the final Enterobacterales sPLS-DA
# model.
vc_ebac_doms_tot <- predict(tot_ebac_doms_train_splsda_final,
                        vc_ebac_doms_metab_mat,
                        dist = "all")

# Mahalanobis-distance class calls at column `tot_ebac_doms_train_ncomp`.
# NOTE(review): tuning used dist = "max.dist" and the final model is fit with
# at least 2 components, while this uses mahalanobis.dist at the tuned
# component count — confirm both choices are intended.
vc_ebac_doms_tot_predict <- vc_ebac_doms_tot$class$mahalanobis.dist[, tot_ebac_doms_train_ncomp]

# Classes present in either the predictions or the observed labels
vc_ebac_doms_tot_union <- union(vc_ebac_doms_tot_predict, vc_ebac_doms_metab_labs)

# Confusion matrix (first dimension = predicted, second = observed).
# Both vectors are coded with the same explicit levels. The union vector is
# deliberately NOT passed to factor(): as a second positional argument next to
# a named `levels =` it is matched to `labels`, which silently swaps the class
# names whenever the first-appearance order of the union differs from `levels`
# (and errors if the union holds more than two values).
vc_ebac_doms_cm <- confusionMatrix(
  table(
    factor(vc_ebac_doms_tot_predict, levels = c("No Expansion", "Expansion")),
    factor(vc_ebac_doms_metab_labs, levels = c("No Expansion", "Expansion"))
  ),
  positive = "Expansion"
)
vc_ebac_doms_cm
## Confusion Matrix and Statistics
## 
##               
##                No Expansion Expansion
##   No Expansion           27         6
##   Expansion               1         1
##                                           
##                Accuracy : 0.8             
##                  95% CI : (0.6306, 0.9156)
##     No Information Rate : 0.8             
##     P-Value [Acc > NIR] : 0.5993          
##                                           
##                   Kappa : 0.1463          
##                                           
##  Mcnemar's Test P-Value : 0.1306          
##                                           
##             Sensitivity : 0.14286         
##             Specificity : 0.96429         
##          Pos Pred Value : 0.50000         
##          Neg Pred Value : 0.81818         
##              Prevalence : 0.20000         
##          Detection Rate : 0.02857         
##    Detection Prevalence : 0.05714         
##       Balanced Accuracy : 0.55357         
##                                           
##        'Positive' Class : Expansion       
## 
# Additional model measures
# Diagnostic-test statistics (95% confidence level) from the raw
# predicted-by-observed cross-tabulation. Both inputs are cross-tabulated
# as-is, so the class order is whatever table() assigns.
vc_ebac_doms_epi <- epiR::epi.tests(
  table(vc_ebac_doms_tot_predict, vc_ebac_doms_metab_labs),
  conf.level = 0.95
)

# Display version of the 2x2 table: transposed so observed classes are rows,
# with "+" mapped to Expansion and "-" to No Expansion.
vc_ebac_doms_confusion_df <- as.data.frame(t(vc_ebac_doms_epi$tab)) %>%
  rownames_to_column(var = "actual") %>%
  dplyr::rename(
    "Predicted\nExpansion" = "Test +",
    "Predicted\nNo Expansion" = "Test -"
  ) %>%
  mutate(
    actual = case_when(
      grepl("+", actual, fixed = TRUE) ~ "Actual\nExpansion",
      grepl("-", actual, fixed = TRUE) ~ "Actual\nNo Expansion",
      TRUE ~ "Total"
    )
  ) %>%
  column_to_rownames(var = "actual")

vc_ebac_doms_confusion_df
# odds ratio
vc_ebac_doms_epi$detail[2][6,]
## [1] 4.5

sPLS-DA Any Infection

# Train the model to tune hyperparameters
# Exploratory 5-component fit used to decide how many components to keep
tot_infx_train_splsda <- mixOmics::splsda(
  infx_metab_mat,
  infx_metab_labs,
  ncomp = 5
)

# Performance assessment: 5-fold cross-validation repeated 50 times
# (the keepX tuning further down uses 100 repeats)
set.seed(1234)
tot_infx_train_plsda_perf <- perf(
  tot_infx_train_splsda,
  validation = "Mfold",
  folds = 5,
  nrepeat = 50,
  auc = TRUE,
  progressBar = FALSE
)

plot(
  tot_infx_train_plsda_perf,
  col = color.mixo(5:7),
  sd = FALSE,
  auc = TRUE,
  legend.position = "horizontal"
) # ncomp = 1 is best for classification error rate and max.dist

# Candidate numbers of variables to keep per component: 1-10, then 20-100 by 10
tot_infx_train_keepX <- c(seq_len(10), seq(from = 20, to = 108, by = 10))

# Tune keepX (and the component count) on balanced error rate with max.dist,
# 5-fold cross-validation repeated 100 times
set.seed(5678)
tot_infx_train_tune_splsda <- mixOmics::tune.splsda(
  infx_metab_mat,
  infx_metab_labs,
  ncomp = 3, # Choose 3 components (max) to be safe
  test.keepX = tot_infx_train_keepX,
  validation = "Mfold",
  folds = 5,
  nrepeat = 100,
  dist = "max.dist",
  measure = "BER",
  auc = TRUE,
  progressBar = FALSE
)

plot(tot_infx_train_tune_splsda, col = color.jet(3))

tot_infx_train_error <- tot_infx_train_tune_splsda$error.rate
# Optimal number of components based on t-tests on the error rate
tot_infx_train_ncomp <- tot_infx_train_tune_splsda$choice.ncomp$ncomp
tot_infx_train_ncomp #3 component is optimal
## [1] 3
# Optimal number of variables per component; when a single component is
# chosen, the first two are kept so the final model can be fit with 2.
tot_infx_train_select_keepX <-
  tot_infx_train_tune_splsda$choice.keepX[
    seq_len(if (tot_infx_train_ncomp == 1) 2 else tot_infx_train_ncomp)
  ]
tot_infx_train_select_keepX
## comp1 comp2 comp3 
##    90    90    70
# Final Model: refit with the tuned keepX, using at least 2 components
# (a tuned value of 1 is bumped to 2).
tot_infx_train_splsda_final <- mixOmics::splsda(
  infx_metab_mat,
  infx_metab_labs,
  ncomp = if (tot_infx_train_ncomp == 1) 2 else tot_infx_train_ncomp,
  keepX = tot_infx_train_select_keepX
)

# Untouched copy kept for the downstream Excel export of component loadings
tot_infx_train_splsda_final_copy <- tot_infx_train_splsda_final

# One sample per infected patient: among rows flagged "Yes", de-duplicate per
# (patient, eday, infx_stool), keep the row(s) with the smallest absolute
# `infx_stool` within each (patient, eday), then the first remaining row per
# patient. The result stays grouped by patientID.
vc_infx <- vc_peri_criteria_all %>%
  filter(bact_infection_present == "Yes") %>%
  group_by(patientID, eday, infx_stool) %>%
  dplyr::slice(1) %>%
  group_by(patientID, eday) %>%
  filter(abs(infx_stool) == min(abs(infx_stool))) %>%
  group_by(patientID) %>%
  dplyr::slice(1) %>%
  select(patientID, sampleID, bact_infection_present)

# Patients never selected above: their distinct sample rows, restricted to
# each patient's first sample (two sample IDs remapped from -02 to -01).
vc_no_infx <- vc_peri_criteria_all %>%
  filter(patientID %!in% vc_infx$patientID) %>%
  ungroup() %>%
  distinct(patientID, sampleID, bact_infection_present) %>%
  right_join(
    vc_first_samps %>%
      ungroup() %>%
      mutate(sampleID = recode(sampleID, `vc-011-02` = "vc-011-01", `vc-012-02` = "vc-012-01")) %>%
      select(patientID, sampleID) %>%
      filter(patientID %!in% vc_infx$patientID | sampleID %!in% vc_infx$sampleID)
  ) %>%
  ungroup() %>%
  drop_na() %>%
  distinct(patientID, sampleID, bact_infection_present)

# Infected and non-infected sample lists stacked together
vc_tot_infx <- bind_rows(vc_infx, vc_no_infx)

# Validation-cohort metabolite matrix for the infection model: one row per
# sample in `vc_tot_infx` (row names = sampleID), one column per compound,
# with columns restricted to and ordered like the training matrix
# `infx_metab_mat`.
vc_infx_metab_mat <-
  vc_metab_qual %>% 
  ungroup() %>% 
  select(patientID, sampleID, compound, mvalue) %>% 
  # Two sample IDs in the first-sample table are remapped from -02 to -01
  left_join(vc_first_samps %>% mutate(sampleID = recode(sampleID, `vc-011-02` = "vc-011-01", `vc-012-02` = "vc-012-01"))) %>% 
  mutate(compound = str_to_title(compound)) %>%
  filter(compound %in% heatmap_cmpds$compound|is.na(compound)) %>% 
  ungroup() %>%
  # Restrict to (and guarantee a row for) every sample in vc_tot_infx
  right_join(vc_tot_infx) %>% 
  # Average repeated measurements per sample/compound; an all-NA group yields
  # NaN, which is converted to NA two steps below
  group_by(sampleID, compound, bact_infection_present) %>% 
  summarise(mvalue = mean(mvalue, na.rm = TRUE)) %>% 
  ungroup() %>%
  mutate_all(~replace(., is.nan(.), NA)) %>% 
  select(sampleID, compound, mvalue, bact_infection_present) %>% 
  drop_na(sampleID) %>% 
  # Long -> wide: one column per compound
  pivot_wider(names_from = "compound", values_from = "mvalue") %>% 
  filter(sampleID != "") %>%
  # NOTE(review): this recode has no effect on the result — the column is
  # dropped by the select() on the next line.
  mutate(bact_infection_present = ifelse(grepl(x = bact_infection_present, pattern = "No"), "No Infection", "Infection")) %>% 
  select(-bact_infection_present) %>% 
  column_to_rownames(var = "sampleID") %>% 
  # Select the training-matrix columns by name, in training order
  select(names(infx_metab_mat) %>% as.data.frame() %>% dplyr::rename(compound = ".") %>% pull(compound)) %>% 
  # Drop samples with no measured value for any retained compound
  filter_all(any_vars(!is.na(.)))

  
# Observed infection labels in the same row order as the validation matrix.
# The status comes from the first `vc_peri_criteria_all` row of each sample;
# values containing "No" become "No Infection", everything else "Infection".
# NOTE(review): a missing status also falls into "Infection" because grepl()
# returns FALSE for NA — confirm no sample lacks a status.
vc_infx_metab_labs <- vc_infx_metab_mat %>%
  rownames_to_column(var = "sampleID") %>%
  left_join(
    vc_peri_criteria_all %>%
      select(sampleID, bact_infection_present) %>%
      group_by(sampleID) %>%
      dplyr::slice(1)
  ) %>%
  mutate(
    bact_infection_present = ifelse(
      grepl("No", bact_infection_present),
      "No Infection",
      "Infection"
    )
  ) %>%
  pull(bact_infection_present)

# Sanity checks: expect 35 samples x 93 compounds, and 35 labels
dim(vc_infx_metab_mat) #35 93 (means 93 compounds and 35 LT patients/infections)
## [1] 35 93
length(vc_infx_metab_labs) #35 (means 35 LT patients/infections)
## [1] 35
# Model metrics for all samples
# Score the validation-cohort matrix with the final infection sPLS-DA model.
vc_infx_tot <- predict(tot_infx_train_splsda_final,
                        vc_infx_metab_mat,
                        dist = "all")

# max.dist class calls at the number of components used by the final model
# (a tuned value of 1 is bumped to 2, matching the final fit).
vc_infx_tot_predict <- vc_infx_tot$class$max.dist[
  ,
  if (tot_infx_train_ncomp == 1) 2 else tot_infx_train_ncomp
]

# Classes present in either the predictions or the observed labels
vc_infx_tot_union <- union(vc_infx_tot_predict, vc_infx_metab_labs)

# Confusion matrix (first dimension = predicted, second = observed).
# Both vectors are coded with the same explicit levels. The union vector is
# deliberately NOT passed to factor(): as a second positional argument next to
# a named `levels =` it is matched to `labels`, which silently swaps the class
# names whenever the first-appearance order of the union differs from `levels`
# (and errors if the union holds more than two values).
vc_infx_cm <- confusionMatrix(
  table(
    factor(vc_infx_tot_predict, levels = c("No Infection", "Infection")),
    factor(vc_infx_metab_labs, levels = c("No Infection", "Infection"))
  ),
  positive = "Infection"
)
vc_infx_cm
## Confusion Matrix and Statistics
## 
##               
##                No Infection Infection
##   No Infection           26         4
##   Infection               2         3
##                                           
##                Accuracy : 0.8286          
##                  95% CI : (0.6635, 0.9344)
##     No Information Rate : 0.8             
##     P-Value [Acc > NIR] : 0.4328          
##                                           
##                   Kappa : 0.4             
##                                           
##  Mcnemar's Test P-Value : 0.6831          
##                                           
##             Sensitivity : 0.42857         
##             Specificity : 0.92857         
##          Pos Pred Value : 0.60000         
##          Neg Pred Value : 0.86667         
##              Prevalence : 0.20000         
##          Detection Rate : 0.08571         
##    Detection Prevalence : 0.14286         
##       Balanced Accuracy : 0.67857         
##                                           
##        'Positive' Class : Infection       
## 
# as.data.frame(t(data.frame(cbind(t(vc_infx_cm$byClass),t(vc_infx_cm$overall)))))

# Additional model measures
# Diagnostic-test statistics (95% confidence level) from the raw
# predicted-by-observed cross-tabulation. Both inputs are cross-tabulated
# as-is, so the class order is whatever table() assigns.
vc_infx_epi <- epiR::epi.tests(
  table(vc_infx_tot_predict, vc_infx_metab_labs),
  conf.level = 0.95
)

# Display version of the 2x2 table: transposed so observed classes are rows,
# with "+" mapped to Infection and "-" to No Infection.
vc_infx_confusion_df <- as.data.frame(t(vc_infx_epi$tab)) %>%
  rownames_to_column(var = "actual") %>%
  dplyr::rename(
    "Predicted\nInfection" = "Test +",
    "Predicted\nNo Infection" = "Test -"
  ) %>%
  mutate(
    actual = case_when(
      grepl("+", actual, fixed = TRUE) ~ "Actual\nInfection",
      grepl("-", actual, fixed = TRUE) ~ "Actual\nNo Infection",
      TRUE ~ "Total"
    )
  ) %>%
  column_to_rownames(var = "actual")

vc_infx_confusion_df
# odds ratio
vc_infx_epi$detail[2][6,]
## [1] 9.75

Supplemental Figure 9: sPLS-DA Model Metrics Training/Test vs Validation

# Internal-validation metrics table: one row per model (overall diversity, the
# three diversity classes, Enterococcus expansion, Enterobacterales expansion,
# infection) with accuracy, sensitivity, specificity and odds ratio plus their
# lower/upper limits. NA marks metrics that are not filled in for a row.
# NOTE(review): rows 8, 1 and 2 of `diversity_mc_table` are used as accuracy,
# sensitivity and specificity — presumably the row order of
# biostatUtil::multiClassCM()$table; confirm.
# NOTE(review): `$detail[k][r, ]` assumes a fixed layout of epi.tests()$detail
# (columns 2-4 read as estimate / lower / upper; rows 3, 4 and 6 read as
# sensitivity, specificity and odds ratio) — confirm against the installed
# epiR version.
og_metrics <- data.frame(model = c("Diversity Overall: Internal Validation", 
                                   "Diversity Low: Internal Validation",
                                   "Diversity Medium: Internal Validation",
                                   "Diversity High: Internal Validation",
                                   "Enterococcus Expansion: Internal Validation", 
                                   "Enterobacterales Expansion: Internal Validation", 
                                   "Infection: Internal Validation"),
                         # Accuracy: caret overall[1] for the overall/binary models
                         acc = c(diversity_cm$overall[1],
                                 diversity_mc_table$Low_Avg[8],
                                 diversity_mc_table$Med_Avg[8],
                                 diversity_mc_table$High_Avg[8],
                                 ecoc_doms_cm$overall[1],
                                 ebac_doms_cm$overall[1],
                                 infx_cm$overall[1]
                                       ),
                         # Accuracy limits: caret overall[3] / overall[4]
                         acc_lower = c(diversity_cm$overall[3],
                                       diversity_mc_table$Low_Lower[8],
                                       diversity_mc_table$Med_Lower[8],
                                       diversity_mc_table$High_Lower[8],
                                       ecoc_doms_cm$overall[3],
                                       ebac_doms_cm$overall[3],
                                       infx_cm$overall[3]
                                       ),
                         acc_upper = c(diversity_cm$overall[4],
                                       diversity_mc_table$Low_Upper[8],
                                       diversity_mc_table$Med_Upper[8],
                                       diversity_mc_table$High_Upper[8],
                                       ecoc_doms_cm$overall[4],
                                       ebac_doms_cm$overall[4],
                                       infx_cm$overall[4]),
                         # Sensitivity (not filled in for the overall diversity row)
                         sens = c(NA,
                                  diversity_mc_table$Low_Avg[1],
                                  diversity_mc_table$Med_Avg[1],
                                  diversity_mc_table$High_Avg[1],
                                  ecoc_doms_epi$detail[2][3,],
                                  ebac_doms_epi$detail[2][3,],
                                  infx_epi$detail[2][3,]),
                         sens_lower = c(NA,
                                        diversity_mc_table$Low_Lower[1],
                                        diversity_mc_table$Med_Lower[1],
                                        diversity_mc_table$High_Lower[1],
                                        ecoc_doms_epi$detail[3][3,],
                                        ebac_doms_epi$detail[3][3,],
                                        infx_epi$detail[3][3,]),
                         sens_upper = c(NA,
                                        diversity_mc_table$Low_Upper[1],
                                        diversity_mc_table$Med_Upper[1],
                                        diversity_mc_table$High_Upper[1],
                                        ecoc_doms_epi$detail[4][3,],
                                        ebac_doms_epi$detail[4][3,],
                                        infx_epi$detail[4][3,]),
                         # Specificity (not filled in for the overall diversity row)
                         spec = c(NA,
                                  diversity_mc_table$Low_Avg[2],
                                  diversity_mc_table$Med_Avg[2],
                                  diversity_mc_table$High_Avg[2],
                                  ecoc_doms_epi$detail[2][4,],
                                  ebac_doms_epi$detail[2][4,],
                                  infx_epi$detail[2][4,]),
                         spec_lower = c(NA,
                                       diversity_mc_table$Low_Lower[2],
                                       diversity_mc_table$Med_Lower[2],
                                       diversity_mc_table$High_Lower[2],
                                       ecoc_doms_epi$detail[3][4,],
                                       ebac_doms_epi$detail[3][4,],
                                       infx_epi$detail[3][4,]),
                         spec_upper = c(NA,
                                        diversity_mc_table$Low_Upper[2],
                                        diversity_mc_table$Med_Upper[2],
                                        diversity_mc_table$High_Upper[2],
                                       ecoc_doms_epi$detail[4][4,],
                                       ebac_doms_epi$detail[4][4,],
                                       infx_epi$detail[4][4,]),
                         # Odds ratio: per-class DOR from `diversity_epi` for the
                         # diversity classes (limits left as NA for those rows)
                         odds = c(NA,
                                  diversity_epi$DOR[1],
                                  diversity_epi$DOR[2],
                                  diversity_epi$DOR[3],
                                  ecoc_doms_epi$detail[2][6,],
                                  ebac_doms_epi$detail[2][6,],
                                  infx_epi$detail[2][6,]),
                         odds_lower = c(NA,
                                        NA,
                                        NA,
                                        NA,
                                        ecoc_doms_epi$detail[3][6,],
                                        ebac_doms_epi$detail[3][6,],
                                        infx_epi$detail[3][6,]),
                         odds_upper = c(NA,
                                        NA,
                                        NA,
                                        NA,
                                        ecoc_doms_epi$detail[4][6,],
                                        ebac_doms_epi$detail[4][6,],
                                        infx_epi$detail[4][6,])
                         )# %>% 
  # pivot_longer(!model, names_to = "metric", values_to = "metric_value") %>% 
  # drop_na()

vc_metrics <- data.frame(model = c("Diversity Overall: External Validation", 
                                   "Diversity Low: External Validation",
                                   "Diversity Medium: External Validation",
                                   "Diversity High: External Validation",
                                   "Enterococcus Expansion: External Validation", 
                                   "Enterobacterales Expansion: External Validation", 
                                   "Infection: External Validation"),
                          acc = c(vc_diversity_cm$overall[1],
                                 vc_diversity_mc_table$Low_Avg[8],
                                 vc_diversity_mc_table$Med_Avg[8],
                                 vc_diversity_mc_table$High_Avg[8],
                                 vc_ecoc_doms_cm$overall[1],
                                 vc_ebac_doms_cm$overall[1],
                                 vc_infx_cm$overall[1]
                                       ),
                         acc_lower = c(vc_diversity_cm$overall[3],
                                       vc_diversity_mc_table$Low_Lower[8],
                                       vc_diversity_mc_table$Med_Lower[8],
                                       vc_diversity_mc_table$High_Lower[8],
                                       vc_ecoc_doms_cm$overall[3],
                                       vc_ebac_doms_cm$overall[3],
                                       vc_infx_cm$overall[3]
                                       ),
                         acc_upper = c(vc_diversity_cm$overall[4],
                                       vc_diversity_mc_table$Low_Upper[8],
                                       vc_diversity_mc_table$Med_Upper[8],
                                       vc_diversity_mc_table$High_Upper[8],
                                       vc_ecoc_doms_cm$overall[4],
                                       vc_ebac_doms_cm$overall[4],
                                       vc_infx_cm$overall[4]),
                         sens = c(NA,
                                  vc_diversity_mc_table$Low_Avg[1],
                                  vc_diversity_mc_table$Med_Avg[1],
                                  vc_diversity_mc_table$High_Avg[1],
                                  vc_ecoc_doms_epi$detail[2][3,],
                                  vc_ebac_doms_epi$detail[2][3,],
                                  vc_infx_epi$detail[2][3,]),
                         sens_lower = c(NA,
                                        vc_diversity_mc_table$Low_Lower[1],
                                        vc_diversity_mc_table$Med_Lower[1],
                                        vc_diversity_mc_table$High_Lower[1],
                                        vc_ecoc_doms_epi$detail[3][3,],
                                        vc_ebac_doms_epi$detail[3][3,],
                                        vc_infx_epi$detail[3][3,]),
                         sens_upper = c(NA,
                                        vc_diversity_mc_table$Low_Upper[1],
                                        vc_diversity_mc_table$Med_Upper[1],
                                        vc_diversity_mc_table$High_Upper[1],
                                        vc_ecoc_doms_epi$detail[4][3,],
                                        vc_ebac_doms_epi$detail[4][3,],
                                        vc_infx_epi$detail[4][3,]),
                         spec = c(NA,
                                  vc_diversity_mc_table$Low_Avg[2],
                                  vc_diversity_mc_table$Med_Avg[2],
                                  vc_diversity_mc_table$High_Avg[2],
                                  vc_ecoc_doms_epi$detail[2][4,],
                                  vc_ebac_doms_epi$detail[2][4,],
                                  vc_infx_epi$detail[2][4,]),
                         spec_lower = c(NA,
                                       vc_diversity_mc_table$Low_Lower[2],
                                       vc_diversity_mc_table$Med_Lower[2],
                                       vc_diversity_mc_table$High_Lower[2],
                                       vc_ecoc_doms_epi$detail[3][4,],
                                       vc_ebac_doms_epi$detail[3][4,],
                                       vc_infx_epi$detail[3][4,]),
                         spec_upper = c(NA,
                                       vc_diversity_mc_table$Low_Upper[2],
                                       vc_diversity_mc_table$Med_Upper[2],
                                       vc_diversity_mc_table$High_Upper[2],
                                       vc_ecoc_doms_epi$detail[4][4,],
                                       vc_ebac_doms_epi$detail[4][4,],
                                       vc_infx_epi$detail[4][4,]),
                         odds = c(NA,
                                  vc_diversity_epi$DOR[1],
                                  vc_diversity_epi$DOR[2],
                                  vc_diversity_epi$DOR[3],
                                  vc_ecoc_doms_epi$detail[2][6,],
                                  vc_ebac_doms_epi$detail[2][6,],
                                  vc_infx_epi$detail[2][6,]),
                         odds_lower = c(NA,
                                        NA,
                                        NA,
                                        NA,
                                        vc_ecoc_doms_epi$detail[3][6,],
                                        vc_ebac_doms_epi$detail[3][6,],
                                        vc_infx_epi$detail[3][6,]),
                         odds_upper = c(NA,
                                        NA,
                                        NA,
                                        NA,
                                        vc_ecoc_doms_epi$detail[4][6,],
                                        vc_ebac_doms_epi$detail[4][6,],
                                        vc_infx_epi$detail[4][6,])
                         ) #%>% 
  # pivot_longer(!c(model, acc_lower, acc_upper, sens_lower, sens_upper, spec_lower, spec_upper, odds_lower, odds_upper), names_to = "metric", values_to = "metric_value") %>% 
  # filter(is.nan(metric_value)|!is.na(metric_value))

# Stack the two validation metric tables and reshape them into one row per
# model and metric: the point estimate lands in `metric_point`, and that
# metric's own confidence bounds in `lower_ci_value` / `upper_ci_value`.
metrics_total <- bind_rows(og_metrics, vc_metrics) %>%
  # Point estimates go long; every *_lower / *_upper column stays wide for now
  pivot_longer(
    cols = !c(model, acc_lower, acc_upper, sens_lower, sens_upper,
              spec_lower, spec_upper, odds_lower, odds_upper),
    names_to = "metric",
    values_to = "metric_point"
  ) %>%
  # Remove model/metric pairs that have no point estimate
  drop_na(metric_point) %>%
  # Stack the lower bounds, then the upper bounds. Each point estimate is now
  # crossed with every bound column, so the filter below keeps only the pair
  # of bounds that belongs to the row's own metric.
  pivot_longer(
    cols = ends_with("lower"),
    names_to = "lower_ci",
    values_to = "lower_ci_value"
  ) %>%
  pivot_longer(
    cols = ends_with("upper"),
    names_to = "upper_ci",
    values_to = "upper_ci_value"
  ) %>%
  # Odds ratios are intentionally excluded from this table
  filter(
    metric %in% c("acc", "sens", "spec"),
    lower_ci == paste0(metric, "_lower"),
    upper_ci == paste0(metric, "_upper")
  ) %>%
  mutate(
    # Cohort is encoded in the model label
    cohort = factor(
      ifelse(grepl("Internal", model), "Internal Validation", "External Validation"),
      levels = c("External Validation", "Internal Validation")
    ),
    # Strip the cohort suffix, then turn whitespace into line breaks so the
    # labels stack when used as facet strips
    model_simple = gsub(": External Validation|: Internal Validation", "", model),
    model_simple = gsub("\\s", "\n", model_simple),
    model_simple = factor(
      model_simple,
      levels = c(
        "Diversity\nLow",
        "Diversity\nMedium",
        "Diversity\nHigh",
        "Diversity\nOverall",
        "Enterococcus\nExpansion",
        "Enterobacterales\nExpansion",
        "Infection"
      )
    ),
    # Display names, in the order the facets should appear
    metric = factor(
      metric,
      levels = c("acc", "sens", "spec"),
      labels = c("Accuracy", "Sensitivity", "Specificity")
    )
  )
  # The `lower_ci` / `upper_ci` name columns are kept in the result

# Point-and-interval plot of the validation metrics: one facet row per model,
# one facet column per metric, and the two cohorts stacked (and coloured)
# inside each panel.
gg_metrics_total <- metrics_total %>%
  arrange(desc(cohort)) %>%
  ggplot(aes(x = metric_point, y = cohort, color = cohort)) +
  geom_errorbar(
    aes(xmin = lower_ci_value, xmax = upper_ci_value),
    lwd = 1
  ) +
  geom_point(size = 3) +
  facet_grid(model_simple ~ metric, switch = "y") +
  scale_x_continuous(labels = scales::percent) +
  scale_color_manual(values = c("#8EDFE8", "#333F48FF")) +
  guides(color = guide_legend(title = "Model Cohort", reverse = TRUE, ncol = 1)) +
  ylab("Model\n") +
  theme_classic2() +
  theme(
    # Axes: the cohort is conveyed by colour, so the y axis text/ticks are blank
    axis.title.x = eb(),
    axis.title.y = et(color = "black", size = 16),
    axis.text.x = et(color = "black", size = 16, angle = 45, hjust = 1, vjust = 1),
    axis.text.y = eb(),
    axis.ticks.y = eb(),
    # Facet strips
    strip.text.x = et(color = "black", size = 16),
    strip.text.y.left = et(color = "black", size = 16, angle = 0),
    strip.background.x = er(fill = "white"),
    strip.background.y = er(fill = "white"),
    # Panels
    panel.grid.minor = eb(),
    panel.border = er(colour = "black", fill = NA, linewidth = 1),
    panel.spacing.x = unit(2.5, "mm"),
    panel.spacing.y = unit(2, "mm"),
    # Legend
    legend.position = "right",
    legend.title = et(color = "black", size = 16),
    legend.text = et(color = "black", size = 14)
  )

# Show the figure in the rendered document
gg_metrics_total

# Save the same figure to the results folder
ggsave(
  filename = "./Results/Supplemental_Figure_9.pdf",
  plot = gg_metrics_total,
  width = 14,
  height = 8,
  units = "in"
)

# Export the metrics table to CSV, ordered by metric and then by model
write.csv(
  arrange(metrics_total, metric, model),
  file = "./Results/Retrained_sPLSDA_Validation_Metrics.csv",
  row.names = FALSE
)

Plot Loadings for Validation Cohort: Diversity Groups

# Replace the X loadings with their absolute values so that the diversity
# group loadings plot is narrower.
# NOTE: this overwrites the loadings stored on the fitted model object.
tot_diversity_splsda_final$loadings$X <- abs(tot_diversity_splsda_final$loadings$X)

# PDF copy of the component 1 and 2 loadings. The plotting runs inside
# tryCatch(..., finally = dev.off()) so the PDF device is closed even when
# plotLoadings() fails; an unclosed device would capture every later plot.
pdf(file = "./Results/Retrained_Diversity_sPLDSA_Loadings.pdf",
    height = 8, width = 11)

invisible(tryCatch({
    par(mfrow = c(1, 3))

    # Component 1
    plotLoadings(tot_diversity_splsda_final, contrib = "max",
        method = "mean", comp = 1, legend = FALSE, legend.col = c("#EDE342",
            "#F69A97", "#FF51EB"), size.name = 1.1, size.title = rel(1),
        ndisplay = 50)

    # Component 2
    plotLoadings(tot_diversity_splsda_final, contrib = "max",
        method = "mean", comp = 2, legend.col = c("#EDE342",
            "#F69A97", "#FF51EB"), size.name = 1.1, size.title = rel(1),
        ndisplay = 50)
}, finally = dev.off()))

# Same two panels on the active device, with smaller variable labels
par(mfrow = c(1, 3))

# Component 1
plotLoadings(tot_diversity_splsda_final, contrib = "max", method = "mean",
    comp = 1, legend = FALSE, legend.col = c("#EDE342", "#F69A97",
        "#FF51EB"), size.name = 0.6, size.title = rel(1), ndisplay = 50)

# Component 2
plotLoadings(tot_diversity_splsda_final, contrib = "max", method = "mean",
    comp = 2, legend.col = c("#EDE342", "#F69A97", "#FF51EB"),
    size.name = 0.6, size.title = rel(1), ndisplay = 50)

Plot Loadings for Validation Cohort: Enterococcus Expansion

# Replace the X loadings with their absolute values so that the Enterococcus
# expansion loadings plot is narrower.
# NOTE: this overwrites the loadings stored on the fitted model object, and the
# same object is passed to plotLoadings() again when the loadings are exported,
# so that export presumably receives the absolute values as well.
tot_ecoc_doms_train_splsda_final$loadings$X <- abs(tot_ecoc_doms_train_splsda_final$loadings$X)

# PDF copy of the component 1 and 2 loadings. The plotting runs inside
# tryCatch(..., finally = dev.off()) so the PDF device is closed even when
# plotLoadings() fails; an unclosed device would capture every later plot.
pdf(file = "./Results/Retrained_Enterococcus_Expansion_sPLDSA_Loadings.pdf",
    height = 8, width = 11)

invisible(tryCatch({
    par(mfrow = c(1, 3))

    # Component 1
    plotLoadings(tot_ecoc_doms_train_splsda_final, contrib = "max",
        method = "mean", comp = 1, legend = FALSE, legend.col = c("#0C7A3A",
            "black"), size.name = 1.1, size.title = rel(1), ndisplay = 50)

    # Component 2
    plotLoadings(tot_ecoc_doms_train_splsda_final, contrib = "max",
        method = "mean", comp = 2, legend.col = c("#0C7A3A",
            "black"), size.name = 1.1, size.title = rel(1), ndisplay = 50)
}, finally = dev.off()))

# Same two panels on the active device. The layout and the smaller label size
# now match the other loadings sections (diversity, Enterobacterales,
# infection), which all use a 1 x 3 layout with size.name = 0.6 here.
par(mfrow = c(1, 3))

# Component 1
plotLoadings(tot_ecoc_doms_train_splsda_final, contrib = "max",
    method = "mean", comp = 1, legend = FALSE, legend.col = c("#0C7A3A",
        "black"), size.name = 0.6, size.title = rel(1), ndisplay = 50)

# Component 2
plotLoadings(tot_ecoc_doms_train_splsda_final, contrib = "max",
    method = "mean", comp = 2, legend.col = c("#0C7A3A", "black"),
    size.name = 0.6, size.title = rel(1), ndisplay = 50)

Plot Loadings for Validation Cohort: Enterobacterales Expansion

# Replace the X loadings with their absolute values so that the
# Enterobacterales expansion loadings plot is narrower.
# NOTE: this overwrites the loadings stored on the fitted model object, and the
# same object is passed to plotLoadings() again when the loadings are exported,
# so that export presumably receives the absolute values as well.
tot_ebac_doms_train_splsda_final$loadings$X <- abs(tot_ebac_doms_train_splsda_final$loadings$X)

# PDF copy of the component 1 and 2 loadings. The plotting runs inside
# tryCatch(..., finally = dev.off()) so the PDF device is closed even when
# plotLoadings() fails; an unclosed device would capture every later plot.
pdf(file = "./Results/Retrained_Enterobacterales_Expansion_sPLDSA_Loadings.pdf",
    height = 8, width = 11)

invisible(tryCatch({
    par(mfrow = c(1, 3))

    # Component 1
    plotLoadings(tot_ebac_doms_train_splsda_final, contrib = "max",
        method = "mean", comp = 1, legend = FALSE, legend.col = c("#FF0000",
            "black"), size.name = 1.1, size.title = rel(1), ndisplay = 50)

    # Component 2
    plotLoadings(tot_ebac_doms_train_splsda_final, contrib = "max",
        method = "mean", comp = 2, legend.col = c("#FF0000",
            "black"), size.name = 1.1, size.title = rel(1), ndisplay = 50)
}, finally = dev.off()))

# Same two panels on the active device, with smaller variable labels
par(mfrow = c(1, 3))

# Component 1
plotLoadings(tot_ebac_doms_train_splsda_final, contrib = "max",
    method = "mean", comp = 1, legend = FALSE, legend.col = c("#FF0000",
        "black"), size.name = 0.6, size.title = rel(1), ndisplay = 50)

# Component 2
plotLoadings(tot_ebac_doms_train_splsda_final, contrib = "max",
    method = "mean", comp = 2, legend.col = c("#FF0000", "black"),
    size.name = 0.6, size.title = rel(1), ndisplay = 50)

Plot Loadings for Validation Cohort: Any Bacterial Infection

# Replace the X loadings with their absolute values so that the bacterial
# infection loadings plot is narrower.
# NOTE: this overwrites the loadings stored on the fitted model object, and the
# same object is passed to plotLoadings() again when the loadings are exported,
# so that export presumably receives the absolute values as well.
tot_infx_train_splsda_final$loadings$X <- abs(tot_infx_train_splsda_final$loadings$X)

# PDF copy of the component 1 and 2 loadings. The plotting runs inside
# tryCatch(..., finally = dev.off()) so the PDF device is closed even when
# plotLoadings() fails; an unclosed device would capture every later plot.
pdf(file = "./Results/Retrained_Infection_sPLDSA_Loadings.pdf",
    height = 8, width = 11)

invisible(tryCatch({
    par(mfrow = c(1, 3))

    # Component 1
    plotLoadings(tot_infx_train_splsda_final, contrib = "max",
        method = "mean", comp = 1, legend = FALSE, legend.col = c("goldenrod",
            "gray75"), size.name = 1.1, size.title = rel(1),
        ndisplay = 50)

    # Component 2
    plotLoadings(tot_infx_train_splsda_final, contrib = "max",
        method = "mean", comp = 2, legend.col = c("goldenrod",
            "gray75"), size.name = 1.1, size.title = rel(1),
        ndisplay = 50)
}, finally = dev.off()))

# Same two panels on the active device, with smaller variable labels
par(mfrow = c(1, 3))

# Component 1
plotLoadings(tot_infx_train_splsda_final, contrib = "max", method = "mean",
    comp = 1, legend = FALSE, legend.col = c("goldenrod", "gray75"),
    size.name = 0.6, size.title = rel(1), ndisplay = 50)

# Component 2
plotLoadings(tot_infx_train_splsda_final, contrib = "max", method = "mean",
    comp = 2, legend.col = c("goldenrod", "gray75"), size.name = 0.6,
    size.title = rel(1), ndisplay = 50)

Export All Plot Loadings from Re-trained Models

# Capture the value returned by plotLoadings() for components 1 and 2 of each
# re-trained model so the tables can be exported afterwards. Each call also
# draws its plot as a side effect.
# NOTE(review): the diversity calls read `tot_diversity_splsda_final_copy`,
# while the other three models are read from their `*_final` objects directly.

# Diversity groups
tot_diversity_loadings_comp1 <- plotLoadings(
    tot_diversity_splsda_final_copy,
    comp = 1,
    contrib = "max",
    method = "mean",
    ndisplay = 50,
    legend = FALSE,
    legend.col = c("#EDE342", "#F69A97", "#FF51EB"),
    size.name = 1.1,
    size.title = rel(1)
)

tot_diversity_loadings_comp2 <- plotLoadings(
    tot_diversity_splsda_final_copy,
    comp = 2,
    contrib = "max",
    method = "mean",
    ndisplay = 50,
    legend = FALSE,
    legend.col = c("#EDE342", "#F69A97", "#FF51EB"),
    size.name = 1.1,
    size.title = rel(1)
)

# Enterococcus expansion
tot_ecoc_loadings_comp1 <- plotLoadings(
    tot_ecoc_doms_train_splsda_final,
    comp = 1,
    contrib = "max",
    method = "mean",
    ndisplay = 50,
    legend.col = c("#0C7A3A", "black"),
    size.name = 1.1,
    size.title = rel(1)
)

tot_ecoc_loadings_comp2 <- plotLoadings(
    tot_ecoc_doms_train_splsda_final,
    comp = 2,
    contrib = "max",
    method = "mean",
    ndisplay = 50,
    legend.col = c("#0C7A3A", "black"),
    size.name = 1.1,
    size.title = rel(1)
)

# Enterobacterales expansion
tot_ebac_loadings_comp1 <- plotLoadings(
    tot_ebac_doms_train_splsda_final,
    comp = 1,
    contrib = "max",
    method = "mean",
    ndisplay = 50,
    legend = FALSE,
    legend.col = c("#FF0000", "black"),
    size.name = 1.1,
    size.title = rel(1)
)

tot_ebac_loadings_comp2 <- plotLoadings(
    tot_ebac_doms_train_splsda_final,
    comp = 2,
    contrib = "max",
    method = "mean",
    ndisplay = 50,
    legend.col = c("#FF0000", "black"),
    size.name = 1.1,
    size.title = rel(1)
)

# Any bacterial infection
tot_infx_loadings_comp1 <- plotLoadings(
    tot_infx_train_splsda_final,
    comp = 1,
    contrib = "max",
    method = "mean",
    ndisplay = 50,
    legend = FALSE,
    legend.col = c("goldenrod", "gray75"),
    size.name = 1.1,
    size.title = rel(1)
)

tot_infx_loadings_comp2 <- plotLoadings(
    tot_infx_train_splsda_final,
    comp = 2,
    contrib = "max",
    method = "mean",
    ndisplay = 50,
    legend.col = c("goldenrod", "gray75"),
    size.name = 1.1,
    size.title = rel(1)
)

{
    # Turn one captured plotLoadings() result into an export-ready table:
    #   * drop the leading "X." prefix from column names and lower-case them,
    #   * move the row names into a `compound` column,
    #   * attach `class` / `subclass` from `heatmap_order`, placed right after
    #     `compound`.
    # The prefix pattern is anchored and matched case-sensitively so that only
    # a genuine "X." prefix is selected, and the join key is stated explicitly
    # rather than inferred from whichever column names happen to be shared.
    tidy_splsda_loadings <- function(loadings) {
        compound_classes <- heatmap_order %>%
            select(compound, class, subclass)

        loadings %>%
            as.data.frame() %>%
            rename_with(~stringr::str_replace(.x, pattern = "^X\\.",
                replacement = ""), matches("^X\\.", ignore.case = FALSE)) %>%
            rename_with(~stringr::str_to_lower(.x)) %>%
            rownames_to_column(var = "compound") %>%
            left_join(compound_classes, by = "compound") %>%
            dplyr::relocate(class, subclass, .after = "compound")
    }

    # Diversity, components 1 and 2
    wb1 <- tidy_splsda_loadings(tot_diversity_loadings_comp1)
    wb2 <- tidy_splsda_loadings(tot_diversity_loadings_comp2)

    # Enterococcus expansion, components 1 and 2
    wb3 <- tidy_splsda_loadings(tot_ecoc_loadings_comp1)
    wb4 <- tidy_splsda_loadings(tot_ecoc_loadings_comp2)

    # Enterobacterales expansion, components 1 and 2
    wb5 <- tidy_splsda_loadings(tot_ebac_loadings_comp1)
    wb6 <- tidy_splsda_loadings(tot_ebac_loadings_comp2)

    # Any bacterial infection, components 1 and 2
    wb7 <- tidy_splsda_loadings(tot_infx_loadings_comp1)
    wb8 <- tidy_splsda_loadings(tot_infx_loadings_comp2)

    # Define sheet names for each data frame
    wb_names <- list(Diversity_1 = wb1, Diversity_2 = wb2, Enterococcus_Expansion_1 = wb3,
        Enterococcus_Expansion_2 = wb4, Enterobacterales_Expansion_1 = wb5,
        Enterobacterales_Expansion_2 = wb6, Infection_1 = wb7,
        Infection_2 = wb8)
    # Export each data frame to separate sheets in same
    # Excel file; tab colours follow the sheet order above
    openxlsx::write.xlsx(wb_names, file = "./Results/Retrained_Model_sPLSDA_Loadings.xlsx",
        tabColour = c("#EDE342", "#EDE342", "#0C7A3A", "#0C7A3A",
            "#FF0000", "#FF0000", "goldenrod", "goldenrod"),
        zoom = 90)

}

Demographics

## Variables to summarize in the demographics table
vc_demo_vars <- c(
    "race",
    "sex",
    "age",
    "meld_transplant",
    "Alcoholic Hepatitis",
    "Alcoholic Cirrhosis",
    "NAFLD/NASH",
    "Primary Sclerosing Cholangitis",
    "Acute Viral Hepatitis",
    "Chronic Hepatitis B",
    "Chronic Hepatitis C",
    "Autoimmune",
    "Wilson's Disease",
    "Alpha-1 Antitrypsin",
    "Hemachromatosis",
    "Drug Induced Liver Injury or Toxin",
    "Budd Chiari",
    "Cryptogenic",
    "Malignancy",
    "Other",
    "Dialysis",
    "Pressers",
    "Mechanical Ventilation"
)
## Categorical variables that need transformation: every summarized variable
## except the two numeric ones, kept in the same order as above
vc_demo_cats <- setdiff(vc_demo_vars, c("age", "meld_transplant"))


## Demographics table stratified by `any_infection`; missing values are counted
## as their own level, and kruskal.test is the non-normal test
vc_tab1_1 <- CreateTableOne(
    data = vc_demo,
    vars = vc_demo_vars,
    factorVars = vc_demo_cats,
    strata = "any_infection",
    includeNA = TRUE,
    testNonNormal = "kruskal.test"
)

summary(vc_tab1_1)  # Age is potentially skewed; it needs to be declared as skewed and `CreateTableOne` re-run
## 
##      ### Summary of continuous variables ###
## 
## any_infection: 0
##                  n miss p.miss mean sd median p25 p75 min max skew kurt
## age             28    0      0   51 12     54  43  61  23  71 -0.5 -0.3
## meld_transplant 28    0      0   28  9     28  22  32  14  46  0.3 -0.6
## ------------------------------------------------------------ 
## any_infection: 1
##                 n miss p.miss mean sd median p25 p75 min max skew kurt
## age             7    0      0   52 19     59  48  64  15  68 -1.6  1.8
## meld_transplant 7    0      0   26 12     28  20  32   6  43 -0.5 -0.1
## 
## p-values
##                   pNormal pNonNormal
## age             0.8542412  0.3860101
## meld_transplant 0.6861052  0.8042618
## 
## Standardize mean differences
##                     1 vs 2
## age             0.06662741
## meld_transplant 0.15442682
## 
## =======================================================================================
## 
##      ### Summary of categorical variables ### 
## 
## any_infection: 0
##                                 var  n miss p.miss
##                                race 28    0    0.0
##                                                   
##                                                   
##                                                   
##                                                   
##                                                   
##                                                   
##                                 sex 28    0    0.0
##                                                   
##                                                   
##                 Alcoholic Hepatitis 28    0    0.0
##                                                   
##                                                   
##                 Alcoholic Cirrhosis 28    0    0.0
##                                                   
##                                                   
##                          NAFLD/NASH 28    0    0.0
##                                                   
##                                                   
##      Primary Sclerosing Cholangitis 28    0    0.0
##                                                   
##               Acute Viral Hepatitis 28    0    0.0
##                                                   
##                 Chronic Hepatitis B 28    0    0.0
##                                                   
##                 Chronic Hepatitis C 28    0    0.0
##                                                   
##                                                   
##                          Autoimmune 28    0    0.0
##                                                   
##                    Wilson's Disease 28    0    0.0
##                                                   
##                 Alpha-1 Antitrypsin 28    0    0.0
##                                                   
##                     Hemachromatosis 28    0    0.0
##                                                   
##  Drug Induced Liver Injury or Toxin 28    0    0.0
##                                                   
##                         Budd Chiari 28    0    0.0
##                                                   
##                         Cryptogenic 28    0    0.0
##                                                   
##                          Malignancy 28    0    0.0
##                                                   
##                                                   
##                               Other 28    0    0.0
##                                                   
##                                                   
##                            Dialysis 28    0    0.0
##                                                   
##                                                   
##                            Pressers 28    0    0.0
##                                                   
##                                                   
##              Mechanical Ventilation 28    0    0.0
##                                                   
##                                                   
##                             level freq percent cum.percent
##  American Indian or Alaska Native    0     0.0         0.0
##            Black/African-American    3    10.7        10.7
##                More than one Race    1     3.6        14.3
##            Other Pacific Islander    1     3.6        17.9
##                           Unknown    3    10.7        28.6
##                             White   20    71.4       100.0
##                                                           
##                            Female   12    42.9        42.9
##                              Male   16    57.1       100.0
##                                                           
##                                 0   27    96.4        96.4
##                                 1    1     3.6       100.0
##                                                           
##                                 0   17    60.7        60.7
##                                 1   11    39.3       100.0
##                                                           
##                                 0   27    96.4        96.4
##                                 1    1     3.6       100.0
##                                                           
##                                 0   28   100.0       100.0
##                                                           
##                                 0   28   100.0       100.0
##                                                           
##                                 0   28   100.0       100.0
##                                                           
##                                 0   27    96.4        96.4
##                                 1    1     3.6       100.0
##                                                           
##                                 0   28   100.0       100.0
##                                                           
##                                 0   28   100.0       100.0
##                                                           
##                                 0   28   100.0       100.0
##                                                           
##                                 0   28   100.0       100.0
##                                                           
##                                 0   28   100.0       100.0
##                                                           
##                                 0   28   100.0       100.0
##                                                           
##                                 0   28   100.0       100.0
##                                                           
##                                 0   27    96.4        96.4
##                                 1    1     3.6       100.0
##                                                           
##                                 0   25    89.3        89.3
##                                 1    3    10.7       100.0
##                                                           
##                                 0   16    57.1        57.1
##                                 1   12    42.9       100.0
##                                                           
##                                 0   21    75.0        75.0
##                                 1    7    25.0       100.0
##                                                           
##                                 0   25    89.3        89.3
##                                 1    3    10.7       100.0
##                                                           
## ------------------------------------------------------------ 
## any_infection: 1
##                                 var n miss p.miss
##                                race 7    0    0.0
##                                                  
##                                                  
##                                                  
##                                                  
##                                                  
##                                                  
##                                 sex 7    0    0.0
##                                                  
##                                                  
##                 Alcoholic Hepatitis 7    0    0.0
##                                                  
##                                                  
##                 Alcoholic Cirrhosis 7    0    0.0
##                                                  
##                                                  
##                          NAFLD/NASH 7    0    0.0
##                                                  
##                                                  
##      Primary Sclerosing Cholangitis 7    0    0.0
##                                                  
##               Acute Viral Hepatitis 7    0    0.0
##                                                  
##                 Chronic Hepatitis B 7    0    0.0
##                                                  
##                 Chronic Hepatitis C 7    0    0.0
##                                                  
##                                                  
##                          Autoimmune 7    0    0.0
##                                                  
##                    Wilson's Disease 7    0    0.0
##                                                  
##                 Alpha-1 Antitrypsin 7    0    0.0
##                                                  
##                     Hemachromatosis 7    0    0.0
##                                                  
##  Drug Induced Liver Injury or Toxin 7    0    0.0
##                                                  
##                         Budd Chiari 7    0    0.0
##                                                  
##                         Cryptogenic 7    0    0.0
##                                                  
##                          Malignancy 7    0    0.0
##                                                  
##                                                  
##                               Other 7    0    0.0
##                                                  
##                                                  
##                            Dialysis 7    0    0.0
##                                                  
##                                                  
##                            Pressers 7    0    0.0
##                                                  
##                                                  
##              Mechanical Ventilation 7    0    0.0
##                                                  
##                                                  
##                             level freq percent cum.percent
##  American Indian or Alaska Native    1    14.3        14.3
##            Black/African-American    0     0.0        14.3
##                More than one Race    0     0.0        14.3
##            Other Pacific Islander    0     0.0        14.3
##                           Unknown    1    14.3        28.6
##                             White    5    71.4       100.0
##                                                           
##                            Female    4    57.1        57.1
##                              Male    3    42.9       100.0
##                                                           
##                                 0    7   100.0       100.0
##                                 1    0     0.0       100.0
##                                                           
##                                 0    4    57.1        57.1
##                                 1    3    42.9       100.0
##                                                           
##                                 0    7   100.0       100.0
##                                 1    0     0.0       100.0
##                                                           
##                                 0    7   100.0       100.0
##                                                           
##                                 0    7   100.0       100.0
##                                                           
##                                 0    7   100.0       100.0
##                                                           
##                                 0    6    85.7        85.7
##                                 1    1    14.3       100.0
##                                                           
##                                 0    7   100.0       100.0
##                                                           
##                                 0    7   100.0       100.0
##                                                           
##                                 0    7   100.0       100.0
##                                                           
##                                 0    7   100.0       100.0
##                                                           
##                                 0    7   100.0       100.0
##                                                           
##                                 0    7   100.0       100.0
##                                                           
##                                 0    7   100.0       100.0
##                                                           
##                                 0    7   100.0       100.0
##                                 1    0     0.0       100.0
##                                                           
##                                 0    5    71.4        71.4
##                                 1    2    28.6       100.0
##                                                           
##                                 0    4    57.1        57.1
##                                 1    3    42.9       100.0
##                                                           
##                                 0    5    71.4        71.4
##                                 1    2    28.6       100.0
##                                                           
##                                 0    7   100.0       100.0
##                                 1    0     0.0       100.0
##                                                           
## 
## p-values
##                                      pApprox    pExact
## race                               0.3789440 0.5680822
## sex                                0.7991226 0.6772171
## Alcoholic Hepatitis                1.0000000 1.0000000
## Alcoholic Cirrhosis                1.0000000 1.0000000
## NAFLD/NASH                         1.0000000 1.0000000
## Primary Sclerosing Cholangitis            NA        NA
## Acute Viral Hepatitis                     NA        NA
## Chronic Hepatitis B                       NA        NA
## Chronic Hepatitis C                0.8555397 0.3647059
## Autoimmune                                NA        NA
## Wilson's Disease                          NA        NA
## Alpha-1 Antitrypsin                       NA        NA
## Hemachromatosis                           NA        NA
## Drug Induced Liver Injury or Toxin        NA        NA
## Budd Chiari                               NA        NA
## Cryptogenic                               NA        NA
## Malignancy                         1.0000000 1.0000000
## Other                              0.5459717 0.2557573
## Dialysis                           1.0000000 1.0000000
## Pressers                           1.0000000 1.0000000
## Mechanical Ventilation             0.8800137 1.0000000
## 
## Standardize mean differences
##                                        1 vs 2
## race                               0.88345221
## sex                                0.28867513
## Alcoholic Hepatitis                0.27216553
## Alcoholic Cirrhosis                0.07264327
## NAFLD/NASH                         0.27216553
## Primary Sclerosing Cholangitis     0.00000000
## Acute Viral Hepatitis              0.00000000
## Chronic Hepatitis B                0.00000000
## Chronic Hepatitis C                0.38254603
## Autoimmune                         0.00000000
## Wilson's Disease                   0.00000000
## Alpha-1 Antitrypsin                0.00000000
## Hemachromatosis                    0.00000000
## Drug Induced Liver Injury or Toxin 0.00000000
## Budd Chiari                        0.00000000
## Cryptogenic                        0.00000000
## Malignancy                         0.27216553
## Other                              0.46126560
## Dialysis                           0.00000000
## Pressers                           0.08071343
## Mechanical Ventilation             0.48989795
# Continuous variables to report as median [IQR]; they are passed to
# print() as `nonnormal` below.
vc_tableone_skewed <- c("age", "meld_transplant")

# Render the stratified table and keep the printed result for export.
vc_tab1_2 <- print(
    vc_tab1_1,
    nonnormal = vc_tableone_skewed,
    formatOptions = list(big.mark = ",")
)
##                                             Stratified by any_infection
##                                              0                   
##   n                                             28               
##   race (%)                                                       
##      American Indian or Alaska Native            0 (  0.0)       
##      Black/African-American                      3 ( 10.7)       
##      More than one Race                          1 (  3.6)       
##      Other Pacific Islander                      1 (  3.6)       
##      Unknown                                     3 ( 10.7)       
##      White                                      20 ( 71.4)       
##   sex = Male (%)                                16 ( 57.1)       
##   age (median [IQR])                         54.00 [43.00, 61.25]
##   meld_transplant (median [IQR])             28.50 [21.75, 32.25]
##   Alcoholic Hepatitis = 1 (%)                    1 (  3.6)       
##   Alcoholic Cirrhosis = 1 (%)                   11 ( 39.3)       
##   NAFLD/NASH = 1 (%)                             1 (  3.6)       
##   Primary Sclerosing Cholangitis = 0 (%)        28 (100.0)       
##   Acute Viral Hepatitis = 0 (%)                 28 (100.0)       
##   Chronic Hepatitis B = 0 (%)                   28 (100.0)       
##   Chronic Hepatitis C = 1 (%)                    1 (  3.6)       
##   Autoimmune = 0 (%)                            28 (100.0)       
##   Wilson's Disease = 0 (%)                      28 (100.0)       
##   Alpha-1 Antitrypsin = 0 (%)                   28 (100.0)       
##   Hemachromatosis = 0 (%)                       28 (100.0)       
##   Drug Induced Liver Injury or Toxin = 0 (%)    28 (100.0)       
##   Budd Chiari = 0 (%)                           28 (100.0)       
##   Cryptogenic = 0 (%)                           28 (100.0)       
##   Malignancy = 1 (%)                             1 (  3.6)       
##   Other = 1 (%)                                  3 ( 10.7)       
##   Dialysis = 1 (%)                              12 ( 42.9)       
##   Pressers = 1 (%)                               7 ( 25.0)       
##   Mechanical Ventilation = 1 (%)                 3 ( 10.7)       
##                                             Stratified by any_infection
##                                              1                    p     
##   n                                              7                      
##   race (%)                                                         0.379
##      American Indian or Alaska Native            1 ( 14.3)              
##      Black/African-American                      0 (  0.0)              
##      More than one Race                          0 (  0.0)              
##      Other Pacific Islander                      0 (  0.0)              
##      Unknown                                     1 ( 14.3)              
##      White                                       5 ( 71.4)              
##   sex = Male (%)                                 3 ( 42.9)         0.799
##   age (median [IQR])                         59.00 [48.00, 64.50]  0.386
##   meld_transplant (median [IQR])             28.00 [20.50, 32.00]  0.804
##   Alcoholic Hepatitis = 1 (%)                    0 (  0.0)         1.000
##   Alcoholic Cirrhosis = 1 (%)                    3 ( 42.9)         1.000
##   NAFLD/NASH = 1 (%)                             0 (  0.0)         1.000
##   Primary Sclerosing Cholangitis = 0 (%)         7 (100.0)            NA
##   Acute Viral Hepatitis = 0 (%)                  7 (100.0)            NA
##   Chronic Hepatitis B = 0 (%)                    7 (100.0)            NA
##   Chronic Hepatitis C = 1 (%)                    1 ( 14.3)         0.856
##   Autoimmune = 0 (%)                             7 (100.0)            NA
##   Wilson's Disease = 0 (%)                       7 (100.0)            NA
##   Alpha-1 Antitrypsin = 0 (%)                    7 (100.0)            NA
##   Hemachromatosis = 0 (%)                        7 (100.0)            NA
##   Drug Induced Liver Injury or Toxin = 0 (%)     7 (100.0)            NA
##   Budd Chiari = 0 (%)                            7 (100.0)            NA
##   Cryptogenic = 0 (%)                            7 (100.0)            NA
##   Malignancy = 1 (%)                             0 (  0.0)         1.000
##   Other = 1 (%)                                  2 ( 28.6)         0.546
##   Dialysis = 1 (%)                               3 ( 42.9)         1.000
##   Pressers = 1 (%)                               2 ( 28.6)         1.000
##   Mechanical Ventilation = 1 (%)                 0 (  0.0)         0.880
##                                             Stratified by any_infection
##                                              test   
##   n                                                 
##   race (%)                                          
##      American Indian or Alaska Native               
##      Black/African-American                         
##      More than one Race                             
##      Other Pacific Islander                         
##      Unknown                                        
##      White                                          
##   sex = Male (%)                                    
##   age (median [IQR])                         nonnorm
##   meld_transplant (median [IQR])             nonnorm
##   Alcoholic Hepatitis = 1 (%)                       
##   Alcoholic Cirrhosis = 1 (%)                       
##   NAFLD/NASH = 1 (%)                                
##   Primary Sclerosing Cholangitis = 0 (%)            
##   Acute Viral Hepatitis = 0 (%)                     
##   Chronic Hepatitis B = 0 (%)                       
##   Chronic Hepatitis C = 1 (%)                       
##   Autoimmune = 0 (%)                                
##   Wilson's Disease = 0 (%)                          
##   Alpha-1 Antitrypsin = 0 (%)                       
##   Hemachromatosis = 0 (%)                           
##   Drug Induced Liver Injury or Toxin = 0 (%)        
##   Budd Chiari = 0 (%)                               
##   Cryptogenic = 0 (%)                               
##   Malignancy = 1 (%)                                
##   Other = 1 (%)                                     
##   Dialysis = 1 (%)                                  
##   Pressers = 1 (%)                                  
##   Mechanical Ventilation = 1 (%)
# Export the printed table, then read it straight back in: the round trip
# hands us the table as a data frame, which makes the p-value column easy
# to adjust.
write.csv(
    vc_tab1_2,
    file = "./Results/Validation_Cohort_Demo_Table_1.csv",
    row.names = TRUE
)

# Several intermediate data frames are needed to adjust the p-values and
# then put the rows back in their original order.
vc_tab1_2_padjust1 <- read.csv("./Results/Validation_Cohort_Demo_Table_1.csv") %>%
    dplyr::rename(
        ` ` = X,
        `No Infection` = X0,
        `Bacterial Infection` = X1
    )

# Same table, with the row-label column stored as a factor whose levels
# follow the current row order (used further down to restore that order).
vc_tab1_2_padjust2 <- vc_tab1_2_padjust1 %>%
    mutate(` ` = factor(` `, levels = vc_tab1_2_padjust1$` `))

vc_tab1_2_padjust3 <- vc_tab1_2_padjust1 %>%
    # Rows that carry a p-value but no test label are tagged "chi.sq"
    mutate(test = ifelse(!is.na(p) & test == "", "chi.sq", test)) %>%
    # Benjamini-Hochberg adjustment, applied separately per test label
    group_by(test) %>%
    rstatix::adjust_pvalue(p.col = "p", method = "BH") %>%
    ungroup() %>%
    # Restore the original row order via the factor levels saved above
    mutate(` ` = factor(` `, vc_tab1_2_padjust2$` `)) %>%
    arrange(` `) %>%
    # Blank out missing p-values so the exported table shows empty cells
    mutate(
        p = ifelse(is.na(p), "", p),
        p.adj = ifelse(is.na(p.adj), "", p.adj)
    )

# Write the table, now including the adjusted p-values, to its own csv
write.csv(
    vc_tab1_2_padjust3,
    file = "./Results/Validation_Cohort_Demo_Table_1_padjust.csv",
    row.names = FALSE
)

Antibiotics

## Vector of variables to summarize
## NOTE: "Tobramicin" (sic) has to match the label assigned in the
## case_when() below, because that label becomes the column name after
## pivot_wider().
vc_abx_vars <- c(
    "Basiliximab", "Mycophenolate", "Steroid", "Systemic Vancomycin",
    "Tacrolimus", "Cefepime", "Metronidazole", "Piperacillin/Tazobactam",
    "Rifaximin", "Ceftriaxone", "Ciprofloxacin", "Gentamicin",
    "Tobramicin", "Daptomycin", "Meropenem", "Oral Vancomycin"
)

# Case-insensitive regular-expression test of `x` against `pattern`
vc_med_matches <- function(x, pattern) {
    grepl(pattern, x, ignore.case = TRUE)
}

# Build a one-row-per-patient table with a 0/1 indicator column for every
# immunosuppressant / antibiotic label that has at least one kept record.
vc_abx2 <- vc_abx %>%
    # Keep rows that look like an immunosuppressant / steroid / lactulose
    # order, plus every row whose mar_action contains "given"
    filter(
        vc_med_matches(medication_name, "basilix|tacro|steroid|mycophenolate|lactulose") |
            vc_med_matches(pharm_class, "GLUCOCORTICOIDS|steroid") |
            vc_med_matches(pharm_sub_class, "steroid") |
            vc_med_matches(mar_action, "given")
    ) %>%
    mutate(
        # First matching rule wins; rows matching nothing stay NA.
        # NOTE(review): "one|ide" matches any medication name containing
        # those letters (e.g. "ceftriaxone", "...chloride"), and the filter
        # above lets every "given" row through -- confirm that such rows
        # are meant to be labelled "Steroid".
        Immunosuppressants = case_when(
            vc_med_matches(medication_name, "basilix") ~ "Basiliximab",
            vc_med_matches(medication_name, "tacro") ~ "Tacrolimus",
            vc_med_matches(medication_name, "one|ide|solu-cortef") ~ "Steroid",
            vc_med_matches(medication_name, "mycophenolate") ~ "Mycophenolate",
            vc_med_matches(medication_name, "lactulose") ~ "Lactulose"
        ),
        Antibiotics = case_when(
            vc_med_matches(medication_name, "rifaximin") ~ "Rifaximin",
            vc_med_matches(medication_name, "lactulose") ~ "Lactulose",
            vc_med_matches(medication_name, "ceftriaxone") ~ "Ceftriaxone",
            vc_med_matches(medication_name, "piperacillin|tazobactam") ~
                "Piperacillin/Tazobactam",
            vc_med_matches(medication_name, "cefepime") ~ "Cefepime",
            vc_med_matches(medication_name, "meropenem") ~ "Meropenem",
            vc_med_matches(medication_name, "gentamicin") ~ "Gentamicin",
            vc_med_matches(medication_name, "tobramycin") ~ "Tobramicin",
            vc_med_matches(medication_name, "vancomycin.+oral") ~ "Oral Vancomycin",
            vc_med_matches(medication_name, "vancomycin.+(IV|Intravenous)") ~
                "Systemic Vancomycin",
            vc_med_matches(medication_name, "METRONIDAZOLE") ~ "Metronidazole",
            vc_med_matches(medication_name, "DAPTOMYCIN") ~ "Daptomycin",
            vc_med_matches(medication_name, "linezolid") ~ "Linezolid",
            vc_med_matches(medication_name, "fluconazole") ~ "Fluconazole",
            vc_med_matches(medication_name, "micafungin") ~ "Micafungin",
            # Ciprofloxacin, excluding records dosed in drops
            vc_med_matches(medication_name, "cipro") &
                !vc_med_matches(dose_units, "drop") ~ "Ciprofloxacin"
        ),
        # Immunosuppressant rows are kept for days 0..30 relative to
        # transplant or when ordered as outpatient; antibiotic rows are
        # kept for days -14..1. Everything else is dropped.
        action = case_when(
            !is.na(Immunosuppressants) &
                (between(days_transplant, 0, 30) | ordering_mode == "Outpatient") ~ "keep",
            !is.na(Antibiotics) & between(days_transplant, -14, 1) ~ "keep",
            TRUE ~ "remove"
        )
    ) %>%
    # Earliest kept record per patient x label combination
    group_by(patientID, Immunosuppressants, Antibiotics) %>%
    arrange(days_transplant) %>%
    filter(action == "keep") %>%
    dplyr::slice(1) %>%
    select(
        patientID, bact_infection_present, Immunosuppressants,
        Antibiotics
    ) %>%
    # left_join(peri_criteria_all %>% select(patientID,
    # any_infection)) %>%
    # Stack both label columns, drop the empty ones, then spread each label
    # into its own indicator column (1 = present, 0 = absent)
    pivot_longer(
        !c(patientID, bact_infection_present),
        names_to = "variable",
        values_to = "value"
    ) %>%
    drop_na(value) %>%
    mutate(variable = 1) %>%
    pivot_wider(
        id_cols = c(patientID, bact_infection_present),
        names_from = "value",
        values_from = "variable",
        values_fn = min
    ) %>%
    replace(is.na(.), 0) %>%
    # Human-readable strata labels, with "No Infection" as the first level
    mutate(
        bact_infection_present = recode(
            bact_infection_present,
            No = "No Infection",
            Yes = "Bacterial Infection"
        ),
        bact_infection_present = factor(
            bact_infection_present,
            levels = c("No Infection", "Bacterial Infection")
        )
    )

# Table of the medication indicator columns, stratified by infection
# status; every listed variable is treated as categorical.
vc_abx_tab1_1 <- CreateTableOne(
    vars = vc_abx_vars,
    factorVars = vc_abx_vars,
    strata = "bact_infection_present",
    data = vc_abx2,
    includeNA = FALSE,
    testNonNormal = "kruskal.test"
)

summary(vc_abx_tab1_1)
## 
##      ### Summary of categorical variables ### 
## 
## bact_infection_present: No Infection
##                      var  n miss p.miss level freq percent cum.percent
##              Basiliximab 28    0    0.0     0   11    39.3        39.3
##                                             1   17    60.7       100.0
##                                                                       
##            Mycophenolate 28    0    0.0     0    4    14.3        14.3
##                                             1   24    85.7       100.0
##                                                                       
##                  Steroid 28    0    0.0     1   28   100.0       100.0
##                                                                       
##      Systemic Vancomycin 28    0    0.0     0   10    35.7        35.7
##                                             1   18    64.3       100.0
##                                                                       
##               Tacrolimus 28    0    0.0     1   28   100.0       100.0
##                                                                       
##                 Cefepime 28    0    0.0     0   16    57.1        57.1
##                                             1   12    42.9       100.0
##                                                                       
##            Metronidazole 28    0    0.0     0   10    35.7        35.7
##                                             1   18    64.3       100.0
##                                                                       
##  Piperacillin/Tazobactam 28    0    0.0     0    5    17.9        17.9
##                                             1   23    82.1       100.0
##                                                                       
##                Rifaximin 28    0    0.0     0   12    42.9        42.9
##                                             1   16    57.1       100.0
##                                                                       
##              Ceftriaxone 28    0    0.0     0   22    78.6        78.6
##                                             1    6    21.4       100.0
##                                                                       
##            Ciprofloxacin 28    0    0.0     0   21    75.0        75.0
##                                             1    7    25.0       100.0
##                                                                       
##               Gentamicin 28    0    0.0     0   27    96.4        96.4
##                                             1    1     3.6       100.0
##                                                                       
##               Tobramicin 28    0    0.0     0   22    78.6        78.6
##                                             1    6    21.4       100.0
##                                                                       
##               Daptomycin 28    0    0.0     0   27    96.4        96.4
##                                             1    1     3.6       100.0
##                                                                       
##                Meropenem 28    0    0.0     0   28   100.0       100.0
##                                             1    0     0.0       100.0
##                                                                       
##          Oral Vancomycin 28    0    0.0     0   27    96.4        96.4
##                                             1    1     3.6       100.0
##                                                                       
## ------------------------------------------------------------ 
## bact_infection_present: Bacterial Infection
##                      var n miss p.miss level freq percent cum.percent
##              Basiliximab 7    0    0.0     0    2    28.6        28.6
##                                            1    5    71.4       100.0
##                                                                      
##            Mycophenolate 7    0    0.0     0    1    14.3        14.3
##                                            1    6    85.7       100.0
##                                                                      
##                  Steroid 7    0    0.0     1    7   100.0       100.0
##                                                                      
##      Systemic Vancomycin 7    0    0.0     0    1    14.3        14.3
##                                            1    6    85.7       100.0
##                                                                      
##               Tacrolimus 7    0    0.0     1    7   100.0       100.0
##                                                                      
##                 Cefepime 7    0    0.0     0    5    71.4        71.4
##                                            1    2    28.6       100.0
##                                                                      
##            Metronidazole 7    0    0.0     0    5    71.4        71.4
##                                            1    2    28.6       100.0
##                                                                      
##  Piperacillin/Tazobactam 7    0    0.0     0    2    28.6        28.6
##                                            1    5    71.4       100.0
##                                                                      
##                Rifaximin 7    0    0.0     0    2    28.6        28.6
##                                            1    5    71.4       100.0
##                                                                      
##              Ceftriaxone 7    0    0.0     0    4    57.1        57.1
##                                            1    3    42.9       100.0
##                                                                      
##            Ciprofloxacin 7    0    0.0     0    6    85.7        85.7
##                                            1    1    14.3       100.0
##                                                                      
##               Gentamicin 7    0    0.0     0    7   100.0       100.0
##                                            1    0     0.0       100.0
##                                                                      
##               Tobramicin 7    0    0.0     0    3    42.9        42.9
##                                            1    4    57.1       100.0
##                                                                      
##               Daptomycin 7    0    0.0     0    6    85.7        85.7
##                                            1    1    14.3       100.0
##                                                                      
##                Meropenem 7    0    0.0     0    6    85.7        85.7
##                                            1    1    14.3       100.0
##                                                                      
##          Oral Vancomycin 7    0    0.0     0    7   100.0       100.0
##                                            1    0     0.0       100.0
##                                                                      
## 
## p-values
##                           pApprox    pExact
## Basiliximab             0.9303088 0.6888863
## Mycophenolate           1.0000000 1.0000000
## Steroid                        NA        NA
## Systemic Vancomycin     0.5240000 0.3916280
## Tacrolimus                     NA        NA
## Cefepime                0.7958092 0.6760304
## Metronidazole           0.2002397 0.1122199
## Piperacillin/Tazobactam 0.9158646 0.6078263
## Rifaximin               0.7958092 0.6760304
## Ceftriaxone             0.4985291 0.3397075
## Ciprofloxacin           0.9198406 1.0000000
## Gentamicin              1.0000000 1.0000000
## Tobramicin              0.1605806 0.1553524
## Daptomycin              0.8555397 0.3647059
## Meropenem               0.4466872 0.2000000
## Oral Vancomycin         1.0000000 1.0000000
## 
## Standardize mean differences
##                            1 vs 2
## Basiliximab             0.2277569
## Mycophenolate           0.0000000
## Steroid                 0.0000000
## Systemic Vancomycin     0.5107539
## Tacrolimus              0.0000000
## Cefepime                0.3015113
## Metronidazole           0.7669650
## Piperacillin/Tazobactam 0.2558409
## Rifaximin               0.3015113
## Ceftriaxone             0.4714045
## Ciprofloxacin           0.2721655
## Gentamicin              0.2721655
## Tobramicin              0.7856742
## Daptomycin              0.3825460
## Meropenem               0.5773503
## Oral Vancomycin         0.2721655
# Render the stratified medication table and keep the printed result
vc_abx_tab1_2 <- print(
    vc_abx_tab1_1,
    formatOptions = list(big.mark = ",")
)
##                                  Stratified by bact_infection_present
##                                   No Infection Bacterial Infection p      test
##   n                               28           7                              
##   Basiliximab = 1 (%)             17 ( 60.7)   5 ( 71.4)            0.930     
##   Mycophenolate = 1 (%)           24 ( 85.7)   6 ( 85.7)            1.000     
##   Steroid = 1 (%)                 28 (100.0)   7 (100.0)               NA     
##   Systemic Vancomycin = 1 (%)     18 ( 64.3)   6 ( 85.7)            0.524     
##   Tacrolimus = 1 (%)              28 (100.0)   7 (100.0)               NA     
##   Cefepime = 1 (%)                12 ( 42.9)   2 ( 28.6)            0.796     
##   Metronidazole = 1 (%)           18 ( 64.3)   2 ( 28.6)            0.200     
##   Piperacillin/Tazobactam = 1 (%) 23 ( 82.1)   5 ( 71.4)            0.916     
##   Rifaximin = 1 (%)               16 ( 57.1)   5 ( 71.4)            0.796     
##   Ceftriaxone = 1 (%)              6 ( 21.4)   3 ( 42.9)            0.499     
##   Ciprofloxacin = 1 (%)            7 ( 25.0)   1 ( 14.3)            0.920     
##   Gentamicin = 1 (%)               1 (  3.6)   0 (  0.0)            1.000     
##   Tobramicin = 1 (%)               6 ( 21.4)   4 ( 57.1)            0.161     
##   Daptomycin = 1 (%)               1 (  3.6)   1 ( 14.3)            0.856     
##   Meropenem = 1 (%)                0 (  0.0)   1 ( 14.3)            0.447     
##   Oral Vancomycin = 1 (%)          1 (  3.6)   0 (  0.0)            1.000
# Export the printed table, then read it straight back in: the round trip
# hands us the table as a data frame, which makes the p-value column easy
# to adjust.
write.csv(
    vc_abx_tab1_2,
    file = "./Results/Validation_Cohort_ABX_Table_1.csv",
    row.names = TRUE
)

# Several intermediate data frames are needed to adjust the p-values and
# then put the rows back in their original order.
vc_abx_tab1_2_padjust1 <- read.csv("./Results/Validation_Cohort_ABX_Table_1.csv") %>%
    dplyr::rename(
        ` ` = X,
        `Bacterial Infection` = Bacterial.Infection,
        `No Infection` = No.Infection
    )

# Same table, with the row-label column stored as a factor whose levels
# follow the current row order (used later to restore that order).
vc_abx_tab1_2_padjust2 <- vc_abx_tab1_2_padjust1 %>%
    mutate(` ` = factor(` `, levels = vc_abx_tab1_2_padjust1$` `))

# Add Benjamini-Hochberg adjusted p-values to the medication table.
#
# The `test` column may come back from read.csv() either as all NA (every
# cell blank) or as character with "" for blank cells (when at least one
# row carries a label). Both cases are normalised to "" first, so that:
#   * rows with a p-value and no label are tagged "chi.sq";
#   * rows that already carry a label keep it (previously any existing
#     label was overwritten with "").
vc_abx_tab1_2_padjust3 <- vc_abx_tab1_2_padjust1 %>%
    mutate(
        test = as.character(test),
        test = ifelse(is.na(test), "", test),
        test = ifelse(!is.na(p) & test == "", "chi.sq", test)
    ) %>%
    # Adjust separately within each test label
    group_by(test) %>%
    rstatix::adjust_pvalue(p.col = "p", method = "BH") %>%
    ungroup() %>%
    # Restore the original row order via the saved factor levels
    mutate(` ` = factor(` `, vc_abx_tab1_2_padjust2$` `)) %>%
    arrange(` `) %>%
    # Blank out missing p-values so the exported table shows empty cells
    mutate(
        p = ifelse(is.na(p), "", p),
        p.adj = ifelse(is.na(p.adj), "", p.adj)
    )

# Write the table, now including the adjusted p-values, to its own csv
write.csv(
    vc_abx_tab1_2_padjust3,
    file = "./Results/Validation_Cohort_ABX_Table_1_padjust.csv",
    row.names = FALSE
)

Compare cohorts

Demographics

# Stack the two cohorts (validation rows first) and tag each row with the
# cohort it came from, so `cohort` can serve as the stratifying variable.
demo_cohorts <- bind_rows(
    vc_demo %>% mutate(cohort = "Validation"),
    demo %>% mutate(cohort = "Original")
)


cohorts_tab1_1 <- CreateTableOne(
    vars = demo_vars,
    factorVars = demo_cats,
    strata = "cohort",
    data = demo_cohorts,
    includeNA = TRUE,
    testNonNormal = "kruskal.test"
)

summary(cohorts_tab1_1)  # Age is potentially skewed, need to state that it is skewed and re-run `CreateTableOne`
## 
##      ### Summary of continuous variables ###
## 
## cohort: Original
##                   n miss p.miss mean sd median p25 p75 min max  skew kurt
## age             107    0      0   52 16     56  42  64   2  77 -0.99  0.7
## meld_transplant 107    0      0   26 10     29  19  33   6  49 -0.09 -0.8
## ------------------------------------------------------------ 
## cohort: Validation
##                  n miss p.miss mean sd median p25 p75 min max  skew kurt
## age             35    0      0   52 13     55  43  62  15  71 -0.87  0.3
## meld_transplant 35    0      0   27 10     28  22  32   6  46 -0.01 -0.4
## 
## p-values
##                   pNormal pNonNormal
## age             0.8978311  0.5811413
## meld_transplant 0.6659789  0.8182777
## 
## Standardize mean differences
##                     1 vs 2
## age             0.02619913
## meld_transplant 0.08548448
## 
## =======================================================================================
## 
##      ### Summary of categorical variables ### 
## 
## cohort: Original
##                                 var   n miss p.miss
##                                race 107    0    0.0
##                                                    
##                                                    
##                                                    
##                                                    
##                                                    
##                                                    
##                                                    
##                                                    
##                                 sex 107    0    0.0
##                                                    
##                                                    
##                 Alcoholic Hepatitis 107    0    0.0
##                                                    
##                                                    
##                 Alcoholic Cirrhosis 107    0    0.0
##                                                    
##                                                    
##                          NAFLD/NASH 107    0    0.0
##                                                    
##                                                    
##      Primary Sclerosing Cholangitis 107    0    0.0
##                                                    
##                                                    
##               Acute Viral Hepatitis 107    0    0.0
##                                                    
##                                                    
##                 Chronic Hepatitis B 107    0    0.0
##                                                    
##                                                    
##                 Chronic Hepatitis C 107    0    0.0
##                                                    
##                                                    
##                          Autoimmune 107    0    0.0
##                                                    
##                                                    
##                    Wilson's Disease 107    0    0.0
##                                                    
##                                                    
##                 Alpha-1 Antitrypsin 107    0    0.0
##                                                    
##                     Hemachromatosis 107    0    0.0
##                                                    
##                                                    
##  Drug Induced Liver Injury or Toxin 107    0    0.0
##                                                    
##                                                    
##                         Budd Chiari 107    0    0.0
##                                                    
##                         Cryptogenic 107    0    0.0
##                                                    
##                                                    
##                          Malignancy 107    0    0.0
##                                                    
##                                                    
##                               Other 107    0    0.0
##                                                    
##                                                    
##                            Dialysis 107    0    0.0
##                                                    
##                                                    
##                            Pressers 107    0    0.0
##                                                    
##                                                    
##              Mechanical Ventilation 107    0    0.0
##                                                    
##                                                    
##                             level freq percent cum.percent
##  American Indian or Alaska Native    1     0.9         0.9
##              Asian/Mideast Indian    8     7.5         8.4
##            Black/African-American   11    10.3        18.7
##                More than one Race   10     9.3        28.0
##            Other Pacific Islander    0     0.0        28.0
##                  Patient Declined    5     4.7        32.7
##                           Unknown    2     1.9        34.6
##                             White   70    65.4       100.0
##                                                           
##                            Female   47    43.9        43.9
##                              Male   60    56.1       100.0
##                                                           
##                                 0   99    92.5        92.5
##                                 1    8     7.5       100.0
##                                                           
##                                 0   59    55.1        55.1
##                                 1   48    44.9       100.0
##                                                           
##                                 0   91    85.0        85.0
##                                 1   16    15.0       100.0
##                                                           
##                                 0  101    94.4        94.4
##                                 1    6     5.6       100.0
##                                                           
##                                 0  103    96.3        96.3
##                                 1    4     3.7       100.0
##                                                           
##                                 0  106    99.1        99.1
##                                 1    1     0.9       100.0
##                                                           
##                                 0  103    96.3        96.3
##                                 1    4     3.7       100.0
##                                                           
##                                 0  102    95.3        95.3
##                                 1    5     4.7       100.0
##                                                           
##                                 0  104    97.2        97.2
##                                 1    3     2.8       100.0
##                                                           
##                                 0  107   100.0       100.0
##                                                           
##                                 0  106    99.1        99.1
##                                 1    1     0.9       100.0
##                                                           
##                                 0  106    99.1        99.1
##                                 1    1     0.9       100.0
##                                                           
##                                 0  107   100.0       100.0
##                                                           
##                                 0  102    95.3        95.3
##                                 1    5     4.7       100.0
##                                                           
##                                 0   84    78.5        78.5
##                                 1   23    21.5       100.0
##                                                           
##                                 0   94    87.9        87.9
##                                 1   13    12.1       100.0
##                                                           
##                                 0   76    71.0        71.0
##                                 1   31    29.0       100.0
##                                                           
##                                 0   96    89.7        89.7
##                                 1   11    10.3       100.0
##                                                           
##                                 0  100    93.5        93.5
##                                 1    7     6.5       100.0
##                                                           
## ------------------------------------------------------------ 
## cohort: Validation
##                                 var  n miss p.miss
##                                race 35    0    0.0
##                                                   
##                                                   
##                                                   
##                                                   
##                                                   
##                                                   
##                                                   
##                                                   
##                                 sex 35    0    0.0
##                                                   
##                                                   
##                 Alcoholic Hepatitis 35    0    0.0
##                                                   
##                                                   
##                 Alcoholic Cirrhosis 35    0    0.0
##                                                   
##                                                   
##                          NAFLD/NASH 35    0    0.0
##                                                   
##                                                   
##      Primary Sclerosing Cholangitis 35    0    0.0
##                                                   
##                                                   
##               Acute Viral Hepatitis 35    0    0.0
##                                                   
##                                                   
##                 Chronic Hepatitis B 35    0    0.0
##                                                   
##                                                   
##                 Chronic Hepatitis C 35    0    0.0
##                                                   
##                                                   
##                          Autoimmune 35    0    0.0
##                                                   
##                                                   
##                    Wilson's Disease 35    0    0.0
##                                                   
##                                                   
##                 Alpha-1 Antitrypsin 35    0    0.0
##                                                   
##                     Hemachromatosis 35    0    0.0
##                                                   
##                                                   
##  Drug Induced Liver Injury or Toxin 35    0    0.0
##                                                   
##                                                   
##                         Budd Chiari 35    0    0.0
##                                                   
##                         Cryptogenic 35    0    0.0
##                                                   
##                                                   
##                          Malignancy 35    0    0.0
##                                                   
##                                                   
##                               Other 35    0    0.0
##                                                   
##                                                   
##                            Dialysis 35    0    0.0
##                                                   
##                                                   
##                            Pressers 35    0    0.0
##                                                   
##                                                   
##              Mechanical Ventilation 35    0    0.0
##                                                   
##                                                   
##                             level freq percent cum.percent
##  American Indian or Alaska Native    1     2.9         2.9
##              Asian/Mideast Indian    0     0.0         2.9
##            Black/African-American    3     8.6        11.4
##                More than one Race    1     2.9        14.3
##            Other Pacific Islander    1     2.9        17.1
##                  Patient Declined    0     0.0        17.1
##                           Unknown    4    11.4        28.6
##                             White   25    71.4       100.0
##                                                           
##                            Female   16    45.7        45.7
##                              Male   19    54.3       100.0
##                                                           
##                                 0   34    97.1        97.1
##                                 1    1     2.9       100.0
##                                                           
##                                 0   21    60.0        60.0
##                                 1   14    40.0       100.0
##                                                           
##                                 0   34    97.1        97.1
##                                 1    1     2.9       100.0
##                                                           
##                                 0   35   100.0       100.0
##                                 1    0     0.0       100.0
##                                                           
##                                 0   35   100.0       100.0
##                                 1    0     0.0       100.0
##                                                           
##                                 0   35   100.0       100.0
##                                 1    0     0.0       100.0
##                                                           
##                                 0   33    94.3        94.3
##                                 1    2     5.7       100.0
##                                                           
##                                 0   35   100.0       100.0
##                                 1    0     0.0       100.0
##                                                           
##                                 0   35   100.0       100.0
##                                 1    0     0.0       100.0
##                                                           
##                                 0   35   100.0       100.0
##                                                           
##                                 0   35   100.0       100.0
##                                 1    0     0.0       100.0
##                                                           
##                                 0   35   100.0       100.0
##                                 1    0     0.0       100.0
##                                                           
##                                 0   35   100.0       100.0
##                                                           
##                                 0   35   100.0       100.0
##                                 1    0     0.0       100.0
##                                                           
##                                 0   34    97.1        97.1
##                                 1    1     2.9       100.0
##                                                           
##                                 0   30    85.7        85.7
##                                 1    5    14.3       100.0
##                                                           
##                                 0   20    57.1        57.1
##                                 1   15    42.9       100.0
##                                                           
##                                 0   26    74.3        74.3
##                                 1    9    25.7       100.0
##                                                           
##                                 0   32    91.4        91.4
##                                 1    3     8.6       100.0
##                                                           
## 
## p-values
##                                       pApprox      pExact
## race                               0.03166089 0.030261150
## sex                                1.00000000 1.000000000
## Alcoholic Hepatitis                0.56591461 0.452490423
## Alcoholic Cirrhosis                0.75891760 0.696336171
## NAFLD/NASH                         0.10660916 0.071552735
## Primary Sclerosing Cholangitis     0.34337432 0.336196590
## Acute Viral Hepatitis              0.56740960 0.572085814
## Chronic Hepatitis B                1.00000000 1.000000000
## Chronic Hepatitis C                0.98368436 0.636320472
## Autoimmune                         0.43907269 0.333322101
## Wilson's Disease                   0.74577750 1.000000000
## Alpha-1 Antitrypsin                        NA          NA
## Hemachromatosis                    1.00000000 1.000000000
## Drug Induced Liver Injury or Toxin 1.00000000 1.000000000
## Budd Chiari                                NA          NA
## Cryptogenic                        0.43907269 0.333322101
## Malignancy                         0.02177662 0.008768824
## Other                              0.97040937 0.772171814
## Dialysis                           0.18828168 0.147658755
## Pressers                           0.04564895 0.045954898
## Mechanical Ventilation             0.97862090 0.708021103
## 
## Standardize mean differences
##                                        1 vs 2
## race                               0.78541034
## sex                                0.03598046
## Alcoholic Hepatitis                0.20983452
## Alcoholic Cirrhosis                0.09844884
## NAFLD/NASH                         0.43460701
## Primary Sclerosing Cholangitis     0.34469099
## Acute Viral Hepatitis              0.27869321
## Chronic Hepatitis B                0.13736056
## Chronic Hepatitis C                0.09321888
## Autoimmune                         0.31311215
## Wilson's Disease                   0.24019223
## Alpha-1 Antitrypsin                0.00000000
## Hemachromatosis                    0.13736056
## Drug Induced Liver Injury or Toxin 0.13736056
## Budd Chiari                        0.00000000
## Cryptogenic                        0.31311215
## Malignancy                         0.59461176
## Other                              0.06310471
## Dialysis                           0.29250434
## Pressers                           0.41011137
## Mechanical Ventilation             0.07683819
# Continuous variables flagged as non-normal for the printed table; they are
# reported as median [IQR] in the output.
cohorts_tableone_skewed <- c("age", "meld_transplant")

# Print the cohort-stratified table and keep the returned object for export.
cohorts_tab1_2 <- print(
  cohorts_tab1_1,
  formatOptions = list(big.mark = ","),
  nonnormal = cohorts_tableone_skewed
)
##                                             Stratified by cohort
##                                              Original            
##   n                                            107               
##   race (%)                                                       
##      American Indian or Alaska Native            1 (  0.9)       
##      Asian/Mideast Indian                        8 (  7.5)       
##      Black/African-American                     11 ( 10.3)       
##      More than one Race                         10 (  9.3)       
##      Other Pacific Islander                      0 (  0.0)       
##      Patient Declined                            5 (  4.7)       
##      Unknown                                     2 (  1.9)       
##      White                                      70 ( 65.4)       
##   sex = Male (%)                                60 ( 56.1)       
##   age (median [IQR])                         56.00 [41.50, 64.00]
##   meld_transplant (median [IQR])             29.00 [19.00, 33.00]
##   Alcoholic Hepatitis = 1 (%)                    8 (  7.5)       
##   Alcoholic Cirrhosis = 1 (%)                   48 ( 44.9)       
##   NAFLD/NASH = 1 (%)                            16 ( 15.0)       
##   Primary Sclerosing Cholangitis = 1 (%)         6 (  5.6)       
##   Acute Viral Hepatitis = 1 (%)                  4 (  3.7)       
##   Chronic Hepatitis B = 1 (%)                    1 (  0.9)       
##   Chronic Hepatitis C = 1 (%)                    4 (  3.7)       
##   Autoimmune = 1 (%)                             5 (  4.7)       
##   Wilson's Disease = 1 (%)                       3 (  2.8)       
##   Alpha-1 Antitrypsin = 0 (%)                  107 (100.0)       
##   Hemachromatosis = 1 (%)                        1 (  0.9)       
##   Drug Induced Liver Injury or Toxin = 1 (%)     1 (  0.9)       
##   Budd Chiari = 0 (%)                          107 (100.0)       
##   Cryptogenic = 1 (%)                            5 (  4.7)       
##   Malignancy = 1 (%)                            23 ( 21.5)       
##   Other = 1 (%)                                 13 ( 12.1)       
##   Dialysis = 1 (%)                              31 ( 29.0)       
##   Pressers = 1 (%)                              11 ( 10.3)       
##   Mechanical Ventilation = 1 (%)                 7 (  6.5)       
##                                             Stratified by cohort
##                                              Validation           p     
##   n                                             35                      
##   race (%)                                                         0.032
##      American Indian or Alaska Native            1 (  2.9)              
##      Asian/Mideast Indian                        0 (  0.0)              
##      Black/African-American                      3 (  8.6)              
##      More than one Race                          1 (  2.9)              
##      Other Pacific Islander                      1 (  2.9)              
##      Patient Declined                            0 (  0.0)              
##      Unknown                                     4 ( 11.4)              
##      White                                      25 ( 71.4)              
##   sex = Male (%)                                19 ( 54.3)         1.000
##   age (median [IQR])                         55.00 [43.00, 62.00]  0.581
##   meld_transplant (median [IQR])             28.00 [21.50, 32.50]  0.818
##   Alcoholic Hepatitis = 1 (%)                    1 (  2.9)         0.566
##   Alcoholic Cirrhosis = 1 (%)                   14 ( 40.0)         0.759
##   NAFLD/NASH = 1 (%)                             1 (  2.9)         0.107
##   Primary Sclerosing Cholangitis = 1 (%)         0 (  0.0)         0.343
##   Acute Viral Hepatitis = 1 (%)                  0 (  0.0)         0.567
##   Chronic Hepatitis B = 1 (%)                    0 (  0.0)         1.000
##   Chronic Hepatitis C = 1 (%)                    2 (  5.7)         0.984
##   Autoimmune = 1 (%)                             0 (  0.0)         0.439
##   Wilson's Disease = 1 (%)                       0 (  0.0)         0.746
##   Alpha-1 Antitrypsin = 0 (%)                   35 (100.0)            NA
##   Hemachromatosis = 1 (%)                        0 (  0.0)         1.000
##   Drug Induced Liver Injury or Toxin = 1 (%)     0 (  0.0)         1.000
##   Budd Chiari = 0 (%)                           35 (100.0)            NA
##   Cryptogenic = 1 (%)                            0 (  0.0)         0.439
##   Malignancy = 1 (%)                             1 (  2.9)         0.022
##   Other = 1 (%)                                  5 ( 14.3)         0.970
##   Dialysis = 1 (%)                              15 ( 42.9)         0.188
##   Pressers = 1 (%)                               9 ( 25.7)         0.046
##   Mechanical Ventilation = 1 (%)                 3 (  8.6)         0.979
##                                             Stratified by cohort
##                                              test   
##   n                                                 
##   race (%)                                          
##      American Indian or Alaska Native               
##      Asian/Mideast Indian                           
##      Black/African-American                         
##      More than one Race                             
##      Other Pacific Islander                         
##      Patient Declined                               
##      Unknown                                        
##      White                                          
##   sex = Male (%)                                    
##   age (median [IQR])                         nonnorm
##   meld_transplant (median [IQR])             nonnorm
##   Alcoholic Hepatitis = 1 (%)                       
##   Alcoholic Cirrhosis = 1 (%)                       
##   NAFLD/NASH = 1 (%)                                
##   Primary Sclerosing Cholangitis = 1 (%)            
##   Acute Viral Hepatitis = 1 (%)                     
##   Chronic Hepatitis B = 1 (%)                       
##   Chronic Hepatitis C = 1 (%)                       
##   Autoimmune = 1 (%)                                
##   Wilson's Disease = 1 (%)                          
##   Alpha-1 Antitrypsin = 0 (%)                       
##   Hemachromatosis = 1 (%)                           
##   Drug Induced Liver Injury or Toxin = 1 (%)        
##   Budd Chiari = 0 (%)                               
##   Cryptogenic = 1 (%)                               
##   Malignancy = 1 (%)                                
##   Other = 1 (%)                                     
##   Dialysis = 1 (%)                                  
##   Pressers = 1 (%)                                  
##   Mechanical Ventilation = 1 (%)
# Round-trip the printed table through a CSV file: reading the file back
# yields a plain data frame, which makes the p-values easy to adjust.
write.csv(
  cohorts_tab1_2,
  file = "./Results/Both_Cohorts_Demo_Table_1.csv",
  row.names = TRUE
)

# Step 1: reload the table; the row-label column comes back as `X` and is
# renamed to a single space.
cohorts_tab1_2_padjust1 <- dplyr::rename(
  read.csv(file = "./Results/Both_Cohorts_Demo_Table_1.csv"),
  ` ` = X
)

# Step 2: freeze the current row order as factor levels so that it can be
# restored after the p-value adjustment below.
cohorts_tab1_2_padjust2 <- cohorts_tab1_2_padjust1 %>%
  mutate(` ` = factor(` `, levels = cohorts_tab1_2_padjust1[[" "]]))

# Step 3: label rows that have a p-value but no test name as "chi.sq", request
# a BH adjustment on the data grouped by test, put the rows back in their
# original order, and blank out missing p-values for the exported table.
cohorts_tab1_2_padjust3 <- cohorts_tab1_2_padjust1 %>%
  mutate(test = ifelse(!is.na(p) & test == "", "chi.sq", test)) %>%
  group_by(test) %>%
  rstatix::adjust_pvalue(p.col = "p", method = "BH") %>%
  ungroup() %>%
  mutate(` ` = factor(` `, levels = cohorts_tab1_2_padjust2[[" "]])) %>%
  arrange(` `) %>%
  mutate(across(c(p, p.adj), ~ ifelse(is.na(.x), "", .x)))

# Save the table with the adjusted p-values appended.
write.csv(
  cohorts_tab1_2_padjust3,
  file = "./Results/Both_Cohorts_Demo_Table_1_padjust.csv",
  row.names = FALSE
)

Antibiotics

# Stack the two cohorts' medication tables, tagging each row with its cohort
# (validation rows first, then original rows).
cohorts_abx <- bind_rows(
  mutate(vc_abx2, cohort = "Validation"),
  mutate(abx2, cohort = "Original")
)

# Build the cohort-stratified table, treating every variable in `abx_vars`
# as categorical.
cohorts_abx_tab1_1 <- CreateTableOne(
  data = cohorts_abx,
  strata = "cohort",
  vars = abx_vars,
  factorVars = abx_vars,
  includeNA = FALSE,
  testNonNormal = "kruskal.test"
)

# Detailed per-cohort summary (counts, percentages, p-values, SMDs).
summary(cohorts_abx_tab1_1)
## 
##      ### Summary of categorical variables ### 
## 
## cohort: Original
##                      var   n miss p.miss level freq percent cum.percent
##              Basiliximab 107    0    0.0     0   37    34.6        34.6
##                                              1   70    65.4       100.0
##                                                                        
##            Mycophenolate 107    0    0.0     0   21    19.6        19.6
##                                              1   86    80.4       100.0
##                                                                        
##                  Steroid 107    0    0.0     1  107   100.0       100.0
##                                                                        
##      Systemic Vancomycin 107    0    0.0     0   44    41.1        41.1
##                                              1   63    58.9       100.0
##                                                                        
##               Tacrolimus 107    0    0.0     0    1     0.9         0.9
##                                              1  106    99.1       100.0
##                                                                        
##                 Cefepime 107    0    0.0     0   65    60.7        60.7
##                                              1   42    39.3       100.0
##                                                                        
##            Metronidazole 107    0    0.0     0   58    54.2        54.2
##                                              1   49    45.8       100.0
##                                                                        
##  Piperacillin/Tazobactam 107    0    0.0     0   14    13.1        13.1
##                                              1   93    86.9       100.0
##                                                                        
##                Rifaximin 107    0    0.0     0   54    50.5        50.5
##                                              1   53    49.5       100.0
##                                                                        
##              Ceftriaxone 107    0    0.0     0   81    75.7        75.7
##                                              1   26    24.3       100.0
##                                                                        
##            Ciprofloxacin 107    0    0.0     0   87    81.3        81.3
##                                              1   20    18.7       100.0
##                                                                        
##               Gentamicin 107    0    0.0     0  105    98.1        98.1
##                                              1    2     1.9       100.0
##                                                                        
##               Tobramicin 107    0    0.0     0  100    93.5        93.5
##                                              1    7     6.5       100.0
##                                                                        
##               Daptomycin 107    0    0.0     0  103    96.3        96.3
##                                              1    4     3.7       100.0
##                                                                        
##                Meropenem 107    0    0.0     0  102    95.3        95.3
##                                              1    5     4.7       100.0
##                                                                        
##          Oral Vancomycin 107    0    0.0     0  104    97.2        97.2
##                                              1    3     2.8       100.0
##                                                                        
## ------------------------------------------------------------ 
## cohort: Validation
##                      var  n miss p.miss level freq percent cum.percent
##              Basiliximab 35    0    0.0     0   13    37.1        37.1
##                                             1   22    62.9       100.0
##                                                                       
##            Mycophenolate 35    0    0.0     0    5    14.3        14.3
##                                             1   30    85.7       100.0
##                                                                       
##                  Steroid 35    0    0.0     1   35   100.0       100.0
##                                                                       
##      Systemic Vancomycin 35    0    0.0     0   11    31.4        31.4
##                                             1   24    68.6       100.0
##                                                                       
##               Tacrolimus 35    0    0.0     0    0     0.0         0.0
##                                             1   35   100.0       100.0
##                                                                       
##                 Cefepime 35    0    0.0     0   21    60.0        60.0
##                                             1   14    40.0       100.0
##                                                                       
##            Metronidazole 35    0    0.0     0   15    42.9        42.9
##                                             1   20    57.1       100.0
##                                                                       
##  Piperacillin/Tazobactam 35    0    0.0     0    7    20.0        20.0
##                                             1   28    80.0       100.0
##                                                                       
##                Rifaximin 35    0    0.0     0   14    40.0        40.0
##                                             1   21    60.0       100.0
##                                                                       
##              Ceftriaxone 35    0    0.0     0   26    74.3        74.3
##                                             1    9    25.7       100.0
##                                                                       
##            Ciprofloxacin 35    0    0.0     0   27    77.1        77.1
##                                             1    8    22.9       100.0
##                                                                       
##               Gentamicin 35    0    0.0     0   34    97.1        97.1
##                                             1    1     2.9       100.0
##                                                                       
##               Tobramicin 35    0    0.0     0   25    71.4        71.4
##                                             1   10    28.6       100.0
##                                                                       
##               Daptomycin 35    0    0.0     0   33    94.3        94.3
##                                             1    2     5.7       100.0
##                                                                       
##                Meropenem 35    0    0.0     0   34    97.1        97.1
##                                             1    1     2.9       100.0
##                                                                       
##          Oral Vancomycin 35    0    0.0     0   34    97.1        97.1
##                                             1    1     2.9       100.0
##                                                                       
## 
## p-values
##                             pApprox      pExact
## Basiliximab             0.942780159 0.839501517
## Mycophenolate           0.647386484 0.617315706
## Steroid                          NA          NA
## Systemic Vancomycin     0.411089474 0.326510288
## Tacrolimus              1.000000000 1.000000000
## Cefepime                1.000000000 1.000000000
## Metronidazole           0.331418773 0.330128924
## Piperacillin/Tazobactam 0.467698201 0.409791463
## Rifaximin               0.378233199 0.332115186
## Ceftriaxone             1.000000000 1.000000000
## Ciprofloxacin           0.769554057 0.627387810
## Gentamicin              1.000000000 1.000000000
## Tobramicin              0.001447514 0.001400685
## Daptomycin              0.983684356 0.636320472
## Meropenem               1.000000000 1.000000000
## Oral Vancomycin         1.000000000 1.000000000
## 
## Standardize mean differences
##                              1 vs 2
## Basiliximab             0.053468977
## Mycophenolate           0.142680721
## Steroid                 0.000000000
## Systemic Vancomycin     0.202634683
## Tacrolimus              0.137360564
## Cefepime                0.015286339
## Metronidazole           0.228545001
## Piperacillin/Tazobactam 0.186942613
## Rifaximin               0.211475899
## Ceftriaxone             0.032684568
## Ciprofloxacin           0.102813256
## Gentamicin              0.065076934
## Tobramicin              0.604939677
## Daptomycin              0.093218882
## Meropenem               0.095499590
## Oral Vancomycin         0.003220228
# Print the cohort-stratified medication table and keep the value returned by
# print() so it can be exported below; big.mark = "," is passed through as a
# number-formatting option.
cohorts_abx_tab1_2 <- print(
    cohorts_abx_tab1_1,
    formatOptions = list(big.mark = ",")
)
##                                  Stratified by cohort
##                                   Original     Validation  p      test
##   n                               107          35                     
##   Basiliximab = 1 (%)              70 ( 65.4)  22 ( 62.9)   0.943     
##   Mycophenolate = 1 (%)            86 ( 80.4)  30 ( 85.7)   0.647     
##   Steroid = 1 (%)                 107 (100.0)  35 (100.0)      NA     
##   Systemic Vancomycin = 1 (%)      63 ( 58.9)  24 ( 68.6)   0.411     
##   Tacrolimus = 1 (%)              106 ( 99.1)  35 (100.0)   1.000     
##   Cefepime = 1 (%)                 42 ( 39.3)  14 ( 40.0)   1.000     
##   Metronidazole = 1 (%)            49 ( 45.8)  20 ( 57.1)   0.331     
##   Piperacillin/Tazobactam = 1 (%)  93 ( 86.9)  28 ( 80.0)   0.468     
##   Rifaximin = 1 (%)                53 ( 49.5)  21 ( 60.0)   0.378     
##   Ceftriaxone = 1 (%)              26 ( 24.3)   9 ( 25.7)   1.000     
##   Ciprofloxacin = 1 (%)            20 ( 18.7)   8 ( 22.9)   0.770     
##   Gentamicin = 1 (%)                2 (  1.9)   1 (  2.9)   1.000     
##   Tobramicin = 1 (%)                7 (  6.5)  10 ( 28.6)   0.001     
##   Daptomycin = 1 (%)                4 (  3.7)   2 (  5.7)   0.984     
##   Meropenem = 1 (%)                 5 (  4.7)   1 (  2.9)   1.000     
##   Oral Vancomycin = 1 (%)           3 (  2.8)   1 (  2.9)   1.000
# Export the printed table with its row names. The same file is read straight
# back in because that yields a data frame, which is an easy starting point
# for adjusting the p-values. The adjustment and re-ordering are spread over
# the intermediate data frames that follow.
write.csv(
    x = cohorts_abx_tab1_2,
    file = "./Results/Both_Cohorts_ABX_Table_1.csv",
    row.names = TRUE
)

cohorts_abx_tab1_2_padjust1 <- dplyr::rename(
    read.csv(file = "./Results/Both_Cohorts_ABX_Table_1.csv"),
    ` ` = X  # the row-name column comes back from read.csv named "X"
)

# Copy of the table whose label column is a factor with levels in the
# original row order. It is only used below to restore that order after the
# grouped p-value adjustment.
cohorts_abx_tab1_2_padjust2 <- cohorts_abx_tab1_2_padjust1 %>%
    mutate(` ` = factor(` `, levels = cohorts_abx_tab1_2_padjust1$` `))

# Benjamini-Hochberg adjustment of the p-values, run separately per test label.
#
# The `test` column needs care: when every entry is blank, read.csv() returns
# it as logical NA, but as soon as one row carries a label the column is read
# as character and blanks arrive as "". Both forms are treated as "no label"
# here, so the labelling does not depend on how the column happened to be
# typed. Rows without a p-value get "", rows with a p-value but no label are
# marked "chi.sq", and any label already present is kept.
#
# NOTE(review): `p` is the value read back from the exported CSV, i.e. as
# printed in the table (three decimals in the output above), so the adjustment
# runs on rounded p-values - confirm this precision is acceptable.
cohorts_abx_tab1_2_padjust3 <- cohorts_abx_tab1_2_padjust1 %>%
    mutate(
        test = trimws(as.character(test)),
        test = dplyr::case_when(
            is.na(p) ~ "",
            is.na(test) | test == "" ~ "chi.sq",
            TRUE ~ test
        )
    ) %>%
    group_by(test) %>%
    rstatix::adjust_pvalue(p.col = "p", method = "BH") %>%
    ungroup() %>%
    # Put the rows back in the order of the original table
    mutate(` ` = factor(` `, levels = levels(cohorts_abx_tab1_2_padjust2$` `))) %>%
    arrange(` `) %>%
    # Show missing p-values as empty cells (this turns both columns into text)
    mutate(
        p = ifelse(is.na(p), "", p),
        p.adj = ifelse(is.na(p.adj), "", p.adj)
    )

# Write the table, now carrying the adjusted p-values, to its own CSV
write.csv(
    x = cohorts_abx_tab1_2_padjust3,
    file = "./Results/Both_Cohorts_ABX_Table_1_padjust.csv",
    row.names = FALSE
)

Save Data Image

# Write the current workspace to an .RData file
save.image("./Data/LT_Modeling.RData")